diff --git a/thirdparty/SZ/COPYRIGHT.txt b/thirdparty/SZ/COPYRIGHT.txt
index de90efe3c09def6f7a3bdd1d0dac7a0b11d82716..8baddd9b7e1dbd6348fe7c56dfc370ce2799c4a0 100644
--- a/thirdparty/SZ/COPYRIGHT.txt
+++ b/thirdparty/SZ/COPYRIGHT.txt
@@ -1,8 +1,9 @@
 Copyright © 2016 , UChicago Argonne, LLC
 All Rights Reserved
-[SZ, Version 1.3]
+[SZ, Version 1.4]
 Sheng Di
 Dingwen Tao
+Xin Liang
 Franck Cappello
 Argonne National Laboratory
 
diff --git a/thirdparty/SZ/sz/include/TightDataPointStorageD.h b/thirdparty/SZ/sz/include/TightDataPointStorageD.h
index 4fc5be82efefa902dee98e8f131d0f420daf29f2..0863fb08406ccaafea81d5e62a2160a710933421 100644
--- a/thirdparty/SZ/sz/include/TightDataPointStorageD.h
+++ b/thirdparty/SZ/sz/include/TightDataPointStorageD.h
@@ -23,6 +23,8 @@ typedef struct TightDataPointStorageD
 	char reqLength;	
 	char radExpo; //used to compute reqLength based on segmented precisions in "pw_rel_compression"
 
+	double minLogValue;
+
 	int stateNum;
 	int allNodes;
 
@@ -52,6 +54,10 @@ typedef struct TightDataPointStorageD
 	
 	unsigned char* pwrErrBoundBytes;
 	int pwrErrBoundBytes_size;
+		
+	unsigned char* raBytes;
+	size_t raBytes_size;
+	
 } TightDataPointStorageD;
 
 void new_TightDataPointStorageD_Empty(TightDataPointStorageD **self);
diff --git a/thirdparty/SZ/sz/include/TightDataPointStorageF.h b/thirdparty/SZ/sz/include/TightDataPointStorageF.h
index eca1717b98a3d8fc53b41f09c48e92e2969d1403..7e5df7d0de045c6c60abc5ec44791291709b006d 100644
--- a/thirdparty/SZ/sz/include/TightDataPointStorageF.h
+++ b/thirdparty/SZ/sz/include/TightDataPointStorageF.h
@@ -34,6 +34,8 @@ typedef struct TightDataPointStorageF
 	unsigned char* rtypeArray;
 	size_t rtypeArray_size;
 	
+	float minLogValue;
+
 	unsigned char* typeArray; //its size is dataSeriesLength/4 (or xxx/4+1) 
 	size_t typeArray_size;
 	
@@ -55,6 +57,9 @@ typedef struct TightDataPointStorageF
 	unsigned char* pwrErrBoundBytes;
 	int pwrErrBoundBytes_size;
 	
+	unsigned char* raBytes;
+	size_t raBytes_size;
+	
 } TightDataPointStorageF;
 
 void new_TightDataPointStorageF_Empty(TightDataPointStorageF **self);
diff --git a/thirdparty/SZ/sz/include/TypeManager.h b/thirdparty/SZ/sz/include/TypeManager.h
index 6be71f8c862c78aa155b7ad384f5c5f5203f9ef1..7c543d3f82aab34cad5ad8eb44e779872f02cf86 100644
--- a/thirdparty/SZ/sz/include/TypeManager.h
+++ b/thirdparty/SZ/sz/include/TypeManager.h
@@ -19,8 +19,10 @@ extern "C" {
 
 //TypeManager.c
 size_t convertIntArray2ByteArray_fast_1b(unsigned char* intArray, size_t intArrayLength, unsigned char **result);
+size_t convertIntArray2ByteArray_fast_1b_to_result(unsigned char* intArray, size_t intArrayLength, unsigned char *result); // same parameters as convertIntArray2ByteArray_fast_1b except result is unsigned char* rather than unsigned char**; presumably writes into a caller-provided buffer -- TODO confirm in TypeManager.c
 void convertByteArray2IntArray_fast_1b(size_t intArrayLength, unsigned char* byteArray, size_t byteArrayLength, unsigned char **intArray);
 size_t convertIntArray2ByteArray_fast_2b(unsigned char* timeStepType, size_t timeStepTypeLength, unsigned char **result);
+size_t convertIntArray2ByteArray_fast_2b_inplace(unsigned char* timeStepType, size_t timeStepTypeLength, unsigned char *result); // same parameters as convertIntArray2ByteArray_fast_2b except result is unsigned char* rather than unsigned char**; NOTE(review): suffix differs from the 1b variant (_to_result vs _inplace) -- confirm both follow the same buffer contract
 void convertByteArray2IntArray_fast_2b(size_t stepLength, unsigned char* byteArray, size_t byteArrayLength, unsigned char **intArray);
 size_t convertIntArray2ByteArray_fast_3b(unsigned char* timeStepType, size_t timeStepTypeLength, unsigned char **result);
 void convertByteArray2IntArray_fast_3b(size_t stepLength, unsigned char* byteArray, size_t byteArrayLength, unsigned char **intArray);
diff --git a/thirdparty/SZ/sz/include/callZlib.h b/thirdparty/SZ/sz/include/callZlib.h
index 0622d9809cdcad17eaeacc02942c2810b843a77c..1aede548c6e5d7aa30475799d1af994a3ccddad4 100644
--- a/thirdparty/SZ/sz/include/callZlib.h
+++ b/thirdparty/SZ/sz/include/callZlib.h
@@ -19,6 +19,8 @@ extern "C" {
 
 #include <stdio.h>
 
+int isZlibFormat(unsigned char magic1, unsigned char magic2); // NOTE(review): takes two bytes; presumably the leading bytes of a stream tested against the zlib header signature -- confirm return convention in callZlib.c
+
 //callZlib.c
 unsigned long zlib_compress(unsigned char* data, unsigned long dataLength, unsigned char** compressBytes, int level);
 unsigned long zlib_compress2(unsigned char* data, unsigned long dataLength, unsigned char** compressBytes, int level);
diff --git a/thirdparty/SZ/sz/include/dataCompression.h b/thirdparty/SZ/sz/include/dataCompression.h
index 1eb0f30f06b97412d5e2a9f95887709f75fba668..6a2da0b57c3a41f68c77abd87a4413dca6ebce2e 100644
--- a/thirdparty/SZ/sz/include/dataCompression.h
+++ b/thirdparty/SZ/sz/include/dataCompression.h
@@ -77,6 +77,18 @@ int computeBlockEdgeSize_3D(int segmentSize);
 int computeBlockEdgeSize_2D(int segmentSize);
 int initRandomAccessBytes(unsigned char* raBytes);
 
+int generateLossyCoefficients_float(float* oriData, double precision, size_t nbEle, int* reqBytesLength, int* resiBitsLength, float* medianValue, float* decData); // float variant; reqBytesLength, resiBitsLength and medianValue are passed by pointer (presumably outputs -- TODO confirm in dataCompression.c)
+int compressExactDataArray_float(float* oriData, double precision, size_t nbEle, unsigned char** leadArray, unsigned char** midArray, unsigned char** resiArray, 
+int reqLength, int reqBytesLength, int resiBitsLength, float medianValue); // leadArray/midArray/resiArray are unsigned char** (presumably buffers produced by the callee -- TODO confirm who allocates and frees)
+
+void decompressExactDataArray_float(unsigned char* leadNum, unsigned char* exactMidBytes, unsigned char* residualMidBits, size_t nbEle, int reqLength, float medianValue, float** decData); // decData is float** (presumably receives the decoded array -- TODO confirm ownership)
+
+int generateLossyCoefficients_double(double* oriData, double precision, size_t nbEle, int* reqBytesLength, int* resiBitsLength, double* medianValue, double* decData); // double counterpart of generateLossyCoefficients_float: same parameter list with float replaced by double
+int compressExactDataArray_double(double* oriData, double precision, size_t nbEle, unsigned char** leadArray, unsigned char** midArray, unsigned char** resiArray, 
+int reqLength, int reqBytesLength, int resiBitsLength, double medianValue); // double counterpart of compressExactDataArray_float: same parameter list with float replaced by double
+
+void decompressExactDataArray_double(unsigned char* leadNum, unsigned char* exactMidBytes, unsigned char* residualMidBits, size_t nbEle, int reqLength, double medianValue, double** decData); // double counterpart of decompressExactDataArray_float: same parameter list with float replaced by double
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/thirdparty/SZ/sz/include/pastriD.h b/thirdparty/SZ/sz/include/pastriD.h
index 1a881e685f66e0d92ec45715a232ee2d1350d091..3ee2813997b308a3c85c13eda317269017e6657a 100644
--- a/thirdparty/SZ/sz/include/pastriD.h
+++ b/thirdparty/SZ/sz/include/pastriD.h
@@ -10,11 +10,11 @@ static inline int64_t pastri_double_quantize(double x, double binSize){
   
   half.d=0.5;
   
-  //printf("pastri_double_quantize:\nx=%lf  x=0x%lx\n",x,(*((uint64_t *)(&x))));
-  //printf("sign(x):0x%lx\n", x);
-  //printf("0.5:0x%lx\n", (*((uint64_t *)(&half))));
+//  //printf("pastri_double_quantize:\nx=%lf  x=0x%lx\n",x,(*((uint64_t *)(&x))));
+//  //printf("sign(x):0x%lx\n", x);
+//  //printf("0.5:0x%lx\n", (*((uint64_t *)(&half))));
   half.ui64 |= (u1.ui64 & (uint64_t)0x8000000000000000);
-  //printf("sign(x)*0.5:0x%lx\n", (*((uint64_t *)(&half))));
+//  //printf("sign(x)*0.5:0x%lx\n", (*((uint64_t *)(&half))));
   return (int64_t)(x + half.d);
 }
 
@@ -26,10 +26,10 @@ static inline void pastri_double_PatternMatch(double*data,pastri_params* p,pastr
   bp->nonZeros=0;
   int i,sb;
   for(i=0;i<p->bSize;i++){
-    //printf("data[%d] = %.16lf\n",i,data[i]);//DEBUG
+//    //printf("data[%d] = %.16lf\n",i,data[i]);//DEBUG
     if(abs_FastD(data[i])>p->usedEb){
       bp->nonZeros++;
-      //if(DEBUG)printf("data[%d]:%.6e\n",i,data[i]); //DEBUG
+      ////if(DEBUG)printf("data[%d]:%.6e\n",i,data[i]); //DEBUG
     }
     if(abs_FastD(data[i])>absExt){
       absExt=abs_FastD(data[i]);
@@ -42,40 +42,40 @@ static inline void pastri_double_PatternMatch(double*data,pastri_params* p,pastr
   double patternExt=data[extIdx];
   bp->binSize=2*p->usedEb;
   
-  //if(DEBUG){printf("Extremum  : data[%d] = %.6e\n",extIdx,patternExt);} //DEBUG
-  //if(DEBUG){printf("patternIdx: %d\n",patternIdx);} //DEBUG
+  ////if(DEBUG){printf("Extremum  : data[%d] = %.6e\n",extIdx,patternExt);} //DEBUG
+  ////if(DEBUG){printf("patternIdx: %d\n",patternIdx);} //DEBUG
   
-  //if(DEBUG){for(i=0;i<p->sbSize;i++){printf("pattern[%d]=data[%d]=%.6e Quantized:%d\n",i,patternIdx+i,data[patternIdx+i],pastri_double_quantize(data[patternIdx+i]/binSize)  );}   }//DEBUG
+  ////if(DEBUG){for(i=0;i<p->sbSize;i++){printf("pattern[%d]=data[%d]=%.6e Quantized:%d\n",i,patternIdx+i,data[patternIdx+i],pastri_double_quantize(data[patternIdx+i]/binSize)  );}   }//DEBUG
   
   //int64_t *patternQ=(int64_t*)(outBuf+15);  //Possible Improvement!
 
   
   for(i=0;i<p->sbSize;i++){
     patternQ[i]=pastri_double_quantize(data[patternIdx+i],bp->binSize);
-    if(D_W){printf("patternQ[%d]=%ld\n",i,patternQ[i]);}
+    //if(D_W){printf("patternQ[%d]=%ld\n",i,patternQ[i]);}
   }
   
   bp->patternBits=bitsNeeded_double((abs_FastD(patternExt)/bp->binSize)+1)+1;
   bp->scaleBits=bp->patternBits;
   bp->scalesBinSize=1/(double)(((uint64_t)1<<(bp->scaleBits-1))-1);
-  //if(DEBUG){printf("(patternExt/binSize)+1: %.6e\n",(patternExt/binSize)+1);} //DEBUG
-  //if(DEBUG){printf("scaleBits=patternBits: %d\n",scaleBits);} //DEBUG
-  if(D_W){printf("scalesBinSize: %.6e\n",bp->scalesBinSize);} //DEBUG
+  ////if(DEBUG){printf("(patternExt/binSize)+1: %.6e\n",(patternExt/binSize)+1);} //DEBUG
+  ////if(DEBUG){printf("scaleBits=patternBits: %d\n",scaleBits);} //DEBUG
+  //if(D_W){printf("scalesBinSize: %.6e\n",bp->scalesBinSize);} //DEBUG
   
   //Calculate Scales.
   //The index part of the input buffer will be reused to hold Scale, Pattern, etc. values.
   int localExtIdx=extIdx%p->sbSize; //Local extremum index. This is not the actual extremum of the current sb, but rather the index that correspond to the global (block) extremum.
   //int64_t *scalesQ=(int64_t*)(outBuf+15+p->sbSize*8);  //Possible Improvement!
   int patternExtZero=(patternExt==0);
-  //if(DEBUG){printf("patternExtZero: %d\n",patternExtZero);} //DEBUG
+  ////if(DEBUG){printf("patternExtZero: %d\n",patternExtZero);} //DEBUG
   for(sb=0;sb<p->sbNum;sb++){
     //scales[sb]=data[sb*p->sbSize+localExtIdx]/patternExt;
     //scales[sb]=patternExtZero ? 0 : data[sb*p->sbSize+localExtIdx]/patternExt;
     //assert(scales[sb]<=1);
     scalesQ[sb]=pastri_double_quantize((patternExtZero ? 0 : data[sb*p->sbSize+localExtIdx]/patternExt),bp->scalesBinSize);
-    if(D_W){printf("scalesQ[%d]=%ld\n",sb,scalesQ[sb]);}
+    //if(D_W){printf("scalesQ[%d]=%ld\n",sb,scalesQ[sb]);}
   }
-  //if(DEBUG){for(i=0;i<p->sbSize;i++){printf("scalesQ[%d]=%ld \n",i,scalesQ[i]);}} //DEBUG
+  ////if(DEBUG){for(i=0;i<p->sbSize;i++){printf("scalesQ[%d]=%ld \n",i,scalesQ[i]);}} //DEBUG
 
   //int64_t *ECQ=(int64_t*)(outBuf+p->bSize*8); //ECQ is written into outBuf, just be careful when handling it.
 
@@ -92,7 +92,7 @@ static inline void pastri_double_PatternMatch(double*data,pastri_params* p,pastr
       double absECQ=abs_FastD(ECQ[_1DIdx]);
       if(absECQ > bp->ECQExt)
         bp->ECQExt=absECQ;
-      //if(DEBUG){printf("EC[%d]: %.6e Quantized:%ld \n",_1DIdx,(scalesQ[sb]*patternQ[i]*scalesBinSize*binSize-data[_1DIdx]),ECQ[_1DIdx]);} //DEBUG
+      ////if(DEBUG){printf("EC[%d]: %.6e Quantized:%ld \n",_1DIdx,(scalesQ[sb]*patternQ[i]*scalesBinSize*binSize-data[_1DIdx]),ECQ[_1DIdx]);} //DEBUG
       switch (ECQ[_1DIdx]){
         case 0:
           //ECQ0s++; //Currently not needed
@@ -117,8 +117,8 @@ static inline void pastri_double_PatternMatch(double*data,pastri_params* p,pastr
       _1DIdx=sb*p->sbSize+i;
       double decompressed=scalesQ[sb]*patternQ[i]*scalesBinSize*binSize-ECQ[_1DIdx]*binSize;
       if(abs_FastD(decompressed-data[_1DIdx])>(p->usedEb)){
-        printf("p->usedEb=%.6e\n",p->usedEb);
-        printf("data[%d]=%.6e decompressed[%d]=%.6e diff=%.6e\n",_1DIdx,data[_1DIdx],_1DIdx,decompressed,abs_FastD(data[_1DIdx]-decompressed));
+        //printf("p->usedEb=%.6e\n",p->usedEb);
+        //printf("data[%d]=%.6e decompressed[%d]=%.6e diff=%.6e\n",_1DIdx,data[_1DIdx],_1DIdx,decompressed,abs_FastD(data[_1DIdx]-decompressed));
         assert(0);
       }
     }
@@ -174,8 +174,8 @@ static inline void pastri_double_Encode(double *data,int64_t* patternQ,int64_t*
   //*(uint16_t*)(&outBuf[5])=p->idxOffset[2];
   //*(uint16_t*)(&outBuf[7])=p->idxOffset[3];
     
-  if(D_W){printf("ECQ0s:%d ECQ1s:%d ECQOthers:%d Total:%d\n",p->bSize-bp->ECQ1s-bp->ECQOthers,bp->ECQ1s,bp->ECQOthers,p->bSize);} //DEBUG
-  if(D_W){printf("numOutliers:%d\n",bp->numOutliers);} //DEBUG
+  //if(D_W){printf("ECQ0s:%d ECQ1s:%d ECQOthers:%d Total:%d\n",p->bSize-bp->ECQ1s-bp->ECQOthers,bp->ECQ1s,bp->ECQOthers,p->bSize);} //DEBUG
+  //if(D_W){printf("numOutliers:%d\n",bp->numOutliers);} //DEBUG
   
   //****************************************************************************************
   //if(0){ //DEBUG
@@ -183,8 +183,8 @@ static inline void pastri_double_Encode(double *data,int64_t* patternQ,int64_t*
   if((UCSparseBytes<UCNonSparseBytes) && (UCSparseBytes<CSparseBytes) && (UCSparseBytes<CNonSparseBytes) ){ 
     //Uncompressed, Sparse bits. Just like the original GAMESS data. Includes: mode, indexOffsets, nonZeros, indexes, data
     *numOutBytes=UCSparseBytes;
-    if(D_G){printf("UCSparse\n");} //DEBUG
-    if(D_G)printf("ECQBits:%d\n",bp->ECQBits); //DEBUG
+    //if(D_G){printf("UCSparse\n");} //DEBUG
+    //if(D_G)printf("ECQBits:%d\n",bp->ECQBits); //DEBUG
     outBuf[0]=0; //mode
     
     //*(uint16_t*)(&outBuf[9])=nonZeros;
@@ -216,7 +216,7 @@ static inline void pastri_double_Encode(double *data,int64_t* patternQ,int64_t*
             }
           }
     
-    if(D_G)printf("UCSparseBytes:%d \n",UCSparseBytes); //DEBUG
+    //if(D_G)printf("UCSparseBytes:%d \n",UCSparseBytes); //DEBUG
     
   //****************************************************************************************
   //}else if(0){ //DEBUG
@@ -224,23 +224,23 @@ static inline void pastri_double_Encode(double *data,int64_t* patternQ,int64_t*
   }else if((UCNonSparseBytes<UCSparseBytes) && (UCNonSparseBytes<CSparseBytes) && (UCNonSparseBytes<CNonSparseBytes) ){ 
     //Uncompressed, NonSparse bits. Includes: mode, indexOffsets, data
     *numOutBytes=UCNonSparseBytes;
-    if(D_G){printf("UCNonSparse\n");} //DEBUG
-    if(D_G)printf("ECQBits:%d\n",bp->ECQBits); //DEBUG
+    //if(D_G){printf("UCNonSparse\n");} //DEBUG
+    //if(D_G)printf("ECQBits:%d\n",bp->ECQBits); //DEBUG
     outBuf[0]=1; //mode
     
     //memcpy(&outBuf[9], &inBuf[p->bSize*8], UCNonSparseBytes-9);
     memcpy(&outBuf[1], data, p->bSize*p->dataSize);
     
-    if(D_G)printf("UCNonSparseBytes:%d \n",UCNonSparseBytes); //DEBUG
+    //if(D_G)printf("UCNonSparseBytes:%d \n",UCNonSparseBytes); //DEBUG
     /*
     for(i=0;i<UCNonSparseBytes-17;i++){
-      printf("%d ",inBuf[p->bSize*8+i]);
+      //printf("%d ",inBuf[p->bSize*8+i]);
     }
-    printf("\n");
+    //printf("\n");
     for(i=0;i<UCNonSparseBytes-17;i++){
-      printf("%d ",outBuf[17+i]);
+      //printf("%d ",outBuf[17+i]);
     }
-    printf("\n");
+    //printf("\n");
     */
   //****************************************************************************************
   //}else if(1){ //DEBUG
@@ -248,9 +248,9 @@ static inline void pastri_double_Encode(double *data,int64_t* patternQ,int64_t*
   }else if((CSparseBytes<UCNonSparseBytes) && (CSparseBytes<UCSparseBytes) && (CSparseBytes<CNonSparseBytes) ){ 
     //Includes: mode, indexOffsets, compressedBytes, patternBits, ECQBits,numOutliers,P, S, {Indexes(Sparse), ECQ}
     *numOutBytes=CSparseBytes;
-    if(D_G){printf("CSparse\n");} //DEBUG
-    if(D_G)printf("ECQBits:%d\n",bp->ECQBits); //DEBUG
-    //if(DEBUG){printf("patternBits:%d _1DIdxBits:%d\n",patternBits,_1DIdxBits);} //DEBUG
+    //if(D_G){printf("CSparse\n");} //DEBUG
+    //if(D_G)printf("ECQBits:%d\n",bp->ECQBits); //DEBUG
+    ////if(DEBUG){printf("patternBits:%d _1DIdxBits:%d\n",patternBits,_1DIdxBits);} //DEBUG
     outBuf[0]=2; //mode
     
     ////outBuf bytes [1:8] are indexOffsets, which are already written. outBuf bytes [9:12] are reserved for compressedBytes.
@@ -269,17 +269,17 @@ static inline void pastri_double_Encode(double *data,int64_t* patternQ,int64_t*
     //Now, we are at the end of 9th byte.
     bitPos=9*8; 
     
-    //if(DEBUG){printf("bitPos_B:%ld\n",bitPos);} //DEBUG
+    ////if(DEBUG){printf("bitPos_B:%ld\n",bitPos);} //DEBUG
 
     for(i=0;i<p->sbSize;i++){
       writeBits_Fast(outBuf,&bitPos,bp->patternBits,patternQ[i]);//Pattern point
     }
-    //if(DEBUG){printf("bitPos_P:%ld\n",bitPos);} //DEBUG
+    ////if(DEBUG){printf("bitPos_P:%ld\n",bitPos);} //DEBUG
     for(i=0;i<p->sbNum;i++){
       writeBits_Fast(outBuf,&bitPos,bp->scaleBits,scalesQ[i]);//Scale
     }
-    //if(DEBUG){printf("bitPos_S:%ld\n",bitPos);} //DEBUG
-    //if(DEBUG)printf("ECQBits:%d\n",ECQBits);
+    ////if(DEBUG){printf("bitPos_S:%ld\n",bitPos);} //DEBUG
+    ////if(DEBUG)printf("ECQBits:%d\n",ECQBits);
     switch(bp->ECQBits){
       case 2:
         for(i=0;i<p->bSize;i++){
@@ -287,7 +287,7 @@ static inline void pastri_double_Encode(double *data,int64_t* patternQ,int64_t*
             case 0:
               break;
             case 1:
-              //if(DEBUG)printf("Index:%d ECQ:%ld Written:0x0\n",i,ECQ[i]); //DEBUG
+              ////if(DEBUG)printf("Index:%d ECQ:%ld Written:0x0\n",i,ECQ[i]); //DEBUG
               writeBits_Fast(outBuf,&bitPos,bp->_1DIdxBits,i);
               //writeBits_Fast(outBuf,&bitPos,2,0x10);
               //writeBits_Fast(outBuf,&bitPos,2,0);//0x00
@@ -295,7 +295,7 @@ static inline void pastri_double_Encode(double *data,int64_t* patternQ,int64_t*
               writeBits_Fast(outBuf,&bitPos,1,0);//0x00
               break;
             case -1:
-              //if(DEBUG)printf("Index:%d ECQ:%ld Written:0x1\n",i,ECQ[i]); //DEBUG
+              ////if(DEBUG)printf("Index:%d ECQ:%ld Written:0x1\n",i,ECQ[i]); //DEBUG
               writeBits_Fast(outBuf,&bitPos,bp->_1DIdxBits,i);
               //writeBits_Fast(outBuf,&bitPos,2,0x11);
               //writeBits_Fast(outBuf,&bitPos,2,1);//0x01
@@ -314,7 +314,7 @@ static inline void pastri_double_Encode(double *data,int64_t* patternQ,int64_t*
           case 0:
             break;
           case 1:
-            //if(DEBUG)printf("Index:%d ECQ:%ld Written:0x00\n",i,ECQ[i]); //DEBUG
+            ////if(DEBUG)printf("Index:%d ECQ:%ld Written:0x00\n",i,ECQ[i]); //DEBUG
             writeBits_Fast(outBuf,&bitPos,bp->_1DIdxBits,i);
             //writeBits_Fast(outBuf,&bitPos,3,0);//0x000
             //writeBits_Fast(outBuf,&bitPos,1,0);
@@ -322,7 +322,7 @@ static inline void pastri_double_Encode(double *data,int64_t* patternQ,int64_t*
             writeBits_Fast(outBuf,&bitPos,1,0);
             break;
           case -1:
-            //if(DEBUG)printf("Index:%d ECQ:%ld Written:0x01\n",i,ECQ[i]); //DEBUG
+            ////if(DEBUG)printf("Index:%d ECQ:%ld Written:0x01\n",i,ECQ[i]); //DEBUG
             writeBits_Fast(outBuf,&bitPos,bp->_1DIdxBits,i);
             //writeBits_Fast(outBuf,&bitPos,3,1);//0x001
             //writeBits_Fast(outBuf,&bitPos,1,0);
@@ -330,7 +330,7 @@ static inline void pastri_double_Encode(double *data,int64_t* patternQ,int64_t*
             writeBits_Fast(outBuf,&bitPos,1,1);
             break;
           default:
-            //if(DEBUG)printf("Index:%d ECQ:%ld Written:0x1 0x%lx\n",i,ECQ[i],ECQ[i]); //DEBUG
+            ////if(DEBUG)printf("Index:%d ECQ:%ld Written:0x1 0x%lx\n",i,ECQ[i],ECQ[i]); //DEBUG
             writeBits_Fast(outBuf,&bitPos,bp->_1DIdxBits,i);
             //writeBits_Fast(outBuf,&bitPos,2+ECQBits,((uint64_t)0x11<<ECQBits)|ECQ[i]);
             //writeBits_Fast(outBuf,&bitPos,2+ECQBits,(ECQ[i]&((uint64_t)0x00<<ECQBits))|((uint64_t)0x01<<ECQBits));
@@ -343,15 +343,15 @@ static inline void pastri_double_Encode(double *data,int64_t* patternQ,int64_t*
       break;
     }
     
-    //if(DEBUG){printf("bitPos_E:%ld\n",bitPos);} //DEBUG
-    if(D_C){if(!((bp->ECQBits>=2)||((bp->ECQBits==1) && (bp->numOutliers==0)))){printf("ERROR: ECQBits:%d numOutliers:%d This should not have happened!\n",bp->ECQBits,bp->numOutliers);assert(0);}} //DEBUG
+    ////if(DEBUG){printf("bitPos_E:%ld\n",bitPos);} //DEBUG
+    if(D_C){if(!((bp->ECQBits>=2)||((bp->ECQBits==1) && (bp->numOutliers==0)))){assert(0);}} //DEBUG: invariant check kept (ECQBits>=2, or ECQBits==1 with zero outliers); only the diagnostic printf is silenced, matching the other asserts this patch leaves active
           
 
     uint32_t bytePos=(bitPos+7)/8;
     //*(uint32_t*)(&outBuf[9])=bytePos;
     *(uint32_t*)(&outBuf[1])=bytePos;
     
-    if(D_G)printf("bitPos:%ld CSparseBits:%d bytePos:%d CSparseBytes:%d\n",bitPos,CSparseBits,bytePos,CSparseBytes); //DEBUG
+    //if(D_G)printf("bitPos:%ld CSparseBits:%d bytePos:%d CSparseBytes:%d\n",bitPos,CSparseBits,bytePos,CSparseBytes); //DEBUG
     if(D_G){assert(bitPos==CSparseBits);}
     
   //****************************************************************************************
@@ -359,9 +359,9 @@ static inline void pastri_double_Encode(double *data,int64_t* patternQ,int64_t*
   }else { 
     //Includes: mode, indexOffsets, compressedBytes, patternBits, ECQBits,P, S, {ECQ}
     *numOutBytes=CNonSparseBytes;
-    if(D_G){printf("CNonSparse\n");} //DEBUG
-    if(D_G)printf("ECQBits:%d\n",bp->ECQBits); //DEBUG
-    //if(DEBUG){printf("patternBits:%d _1DIdxBits:%d\n",patternBits,_1DIdxBits);} //DEBUG
+    //if(D_G){printf("CNonSparse\n");} //DEBUG
+    //if(D_G)printf("ECQBits:%d\n",bp->ECQBits); //DEBUG
+    ////if(DEBUG){printf("patternBits:%d _1DIdxBits:%d\n",patternBits,_1DIdxBits);} //DEBUG
     outBuf[0]=3; //mode
     
     ////outBuf bytes [1:8] are indexOffsets, which are already written. outBuf bytes [9:12] are reserved for compressedBytes.
@@ -374,33 +374,33 @@ static inline void pastri_double_Encode(double *data,int64_t* patternQ,int64_t*
     outBuf[6]=bp->ECQBits;
     bitPos=7*8; //Currently, we are at the end of 7th byte.
     
-    //if(DEBUG){printf("bitPos_B:%ld\n",bitPos);} //DEBUG
+    ////if(DEBUG){printf("bitPos_B:%ld\n",bitPos);} //DEBUG
 
     for(i=0;i<p->sbSize;i++){
       writeBits_Fast(outBuf,&bitPos,bp->patternBits,patternQ[i]);//Pattern point
     }
-    //if(DEBUG){printf("bitPos_P:%ld\n",bitPos);} //DEBUG
+    ////if(DEBUG){printf("bitPos_P:%ld\n",bitPos);} //DEBUG
     for(i=0;i<p->sbNum;i++){
       writeBits_Fast(outBuf,&bitPos,bp->scaleBits,scalesQ[i]);//Scale
     }
-    //if(DEBUG){printf("bitPos_S:%ld\n",bitPos);} //DEBUG
-    //if(DEBUG)printf("ECQBits:%d\n",ECQBits);
+    ////if(DEBUG){printf("bitPos_S:%ld\n",bitPos);} //DEBUG
+    ////if(DEBUG)printf("ECQBits:%d\n",ECQBits);
     switch(bp->ECQBits){
       case 2:
         for(i=0;i<p->bSize;i++){
           switch(ECQ[i]){
             case 0:
-              //if(DEBUG)printf("Index:%d ECQ:%d Written:0x1\n",i,ECQ[i]); //DEBUG
+              ////if(DEBUG)printf("Index:%d ECQ:%d Written:0x1\n",i,ECQ[i]); //DEBUG
               writeBits_Fast(outBuf,&bitPos,1,1);//0x1
               break;
             case 1:
-              //if(DEBUG)printf("Index:%d ECQ:%d Written:0x00\n",i,ECQ[i]); //DEBUG
+              ////if(DEBUG)printf("Index:%d ECQ:%d Written:0x00\n",i,ECQ[i]); //DEBUG
               //writeBits_Fast(outBuf,&bitPos,2,0);//0x00
               writeBits_Fast(outBuf,&bitPos,1,0);
               writeBits_Fast(outBuf,&bitPos,1,0);
               break;
             case -1:
-              //if(DEBUG)printf("Index:%d ECQ:%d Written:0x01\n",i,ECQ[i]); //DEBUG
+              ////if(DEBUG)printf("Index:%d ECQ:%d Written:0x01\n",i,ECQ[i]); //DEBUG
               //writeBits_Fast(outBuf,&bitPos,2,2); //0x01
               writeBits_Fast(outBuf,&bitPos,1,0);
               writeBits_Fast(outBuf,&bitPos,1,1);
@@ -412,60 +412,60 @@ static inline void pastri_double_Encode(double *data,int64_t* patternQ,int64_t*
         }
         break;
       default: //ECQBits>2
-        //if(DEBUG) printf("AMG_W1:bitPos:%ld\n",bitPos); //DEBUG
+        ////if(DEBUG) printf("AMG_W1:bitPos:%ld\n",bitPos); //DEBUG
         for(i=0;i<p->bSize;i++){
-          //if(DEBUG){printf("AMG_W3:bitPos:%ld buffer[%ld]=0x%lx\n",bitPos,bitPos/8,*(uint64_t*)(&outBuf[bitPos/8]));}; //DEBUG
-          //if(DEBUG) printf("AMG_W2:bitPos:%ld\n",bitPos); //DEBUG
-          //if(DEBUG) printf("ECQ[%d]:%ld\n",i,ECQ[i]); //DEBUG
+          ////if(DEBUG){printf("AMG_W3:bitPos:%ld buffer[%ld]=0x%lx\n",bitPos,bitPos/8,*(uint64_t*)(&outBuf[bitPos/8]));}; //DEBUG
+          ////if(DEBUG) printf("AMG_W2:bitPos:%ld\n",bitPos); //DEBUG
+          ////if(DEBUG) printf("ECQ[%d]:%ld\n",i,ECQ[i]); //DEBUG
           switch(ECQ[i]){
             case 0:
-              //if(DEBUG)printf("Index:%d ECQ:%ld Written:0x1\n",i,ECQ[i]); //DEBUG
-              //if(DEBUG){printf("AMG_WB3:bitPos:%ld buffer[%ld]=0x%lx\n",bitPos,bitPos/8,*(uint64_t*)(&outBuf[bitPos/8]));}; //DEBUG
+              ////if(DEBUG)printf("Index:%d ECQ:%ld Written:0x1\n",i,ECQ[i]); //DEBUG
+              ////if(DEBUG){printf("AMG_WB3:bitPos:%ld buffer[%ld]=0x%lx\n",bitPos,bitPos/8,*(uint64_t*)(&outBuf[bitPos/8]));}; //DEBUG
               //temp1=bitPos;
               writeBits_Fast(outBuf,&bitPos,1,1);  //0x1
               //wVal=1; writeBits_Fast(outBuf,&bitPos,1,wVal); //0x1
-              //if(DEBUG){printf("AMG_WA3:bitPos:%ld buffer[%ld]=0x%lx\n",temp1,temp1/8,*(uint64_t*)(&outBuf[temp1/8]));}; //DEBUG
+              ////if(DEBUG){printf("AMG_WA3:bitPos:%ld buffer[%ld]=0x%lx\n",temp1,temp1/8,*(uint64_t*)(&outBuf[temp1/8]));}; //DEBUG
               break;
             case 1:
-              //if(DEBUG)printf("Index:%d ECQ:%ld Written:0x000\n",i,ECQ[i]); //DEBUG
-              //if(DEBUG){printf("AMG_WB3:bitPos:%ld buffer[%ld]=0x%lx\n",bitPos,bitPos/8,*(uint64_t*)(&outBuf[bitPos/8]));}; //DEBUG
+              ////if(DEBUG)printf("Index:%d ECQ:%ld Written:0x000\n",i,ECQ[i]); //DEBUG
+              ////if(DEBUG){printf("AMG_WB3:bitPos:%ld buffer[%ld]=0x%lx\n",bitPos,bitPos/8,*(uint64_t*)(&outBuf[bitPos/8]));}; //DEBUG
               //temp1=bitPos;
               //writeBits_Fast(outBuf,&bitPos,3,0); //0x000
               writeBits_Fast(outBuf,&bitPos,1,0);
               writeBits_Fast(outBuf,&bitPos,1,0);
               writeBits_Fast(outBuf,&bitPos,1,0);
               //wVal=0; writeBits_Fast(outBuf,&bitPos,3,wVal); //0x000
-              //if(DEBUG){printf("AMG_WA3:bitPos:%ld buffer[%ld]=0x%lx\n",temp1,temp1/8,*(uint64_t*)(&outBuf[temp1/8]));}; //DEBUG
+              ////if(DEBUG){printf("AMG_WA3:bitPos:%ld buffer[%ld]=0x%lx\n",temp1,temp1/8,*(uint64_t*)(&outBuf[temp1/8]));}; //DEBUG
               break;
             case -1:
-              //if(DEBUG)printf("Index:%d ECQ:%ld Written:0x001\n",i,ECQ[i]); //DEBUG
-              //if(DEBUG){printf("AMG_WB3:bitPos:%ld buffer[%ld]=0x%lx\n",bitPos,bitPos/8,*(uint64_t*)(&outBuf[bitPos/8]));}; //DEBUG
+              ////if(DEBUG)printf("Index:%d ECQ:%ld Written:0x001\n",i,ECQ[i]); //DEBUG
+              ////if(DEBUG){printf("AMG_WB3:bitPos:%ld buffer[%ld]=0x%lx\n",bitPos,bitPos/8,*(uint64_t*)(&outBuf[bitPos/8]));}; //DEBUG
               //temp1=bitPos;
               //writeBits_Fast(outBuf,&bitPos,3,8); //0x001
               writeBits_Fast(outBuf,&bitPos,1,0); 
               writeBits_Fast(outBuf,&bitPos,1,0); 
               writeBits_Fast(outBuf,&bitPos,1,1); 
               //wVal=8; writeBits_Fast(outBuf,&bitPos,3,wVal); //0x001
-              //if(DEBUG){printf("AMG_WA3:bitPos:%ld buffer[%ld]=0x%lx\n",temp1,temp1/8,*(uint64_t*)(&outBuf[temp1/8]));}; //DEBUG
+              ////if(DEBUG){printf("AMG_WA3:bitPos:%ld buffer[%ld]=0x%lx\n",temp1,temp1/8,*(uint64_t*)(&outBuf[temp1/8]));}; //DEBUG
               break;
             default:
-              //if(DEBUG)printf("Index:%d ECQ:%ld Written:0x01 0x%lx\n",i,ECQ[i]); //DEBUG
-              //if(DEBUG){printf("AMG_WB3:bitPos:%ld buffer[%ld]=0x%lx\n",bitPos,bitPos/8,*(uint64_t*)(&outBuf[bitPos/8]));}; //DEBUG
+              ////if(DEBUG)printf("Index:%d ECQ:%ld Written:0x01 0x%lx\n",i,ECQ[i]); //DEBUG
+              ////if(DEBUG){printf("AMG_WB3:bitPos:%ld buffer[%ld]=0x%lx\n",bitPos,bitPos/8,*(uint64_t*)(&outBuf[bitPos/8]));}; //DEBUG
               //temp1=bitPos;
               //writeBits_Fast(outBuf,&bitPos,2,2); //0x01
               writeBits_Fast(outBuf,&bitPos,1,0); 
               writeBits_Fast(outBuf,&bitPos,1,1); 
               //wVal=2; writeBits_Fast(outBuf,&bitPos,2,wVal); //0x01
               writeBits_Fast(outBuf,&bitPos,bp->ECQBits,ECQ[i]);
-              //if(DEBUG){printf("AMG_WA3:bitPos:%ld buffer[%ld]=0x%lx\n",temp1,temp1/8,*(uint64_t*)(&outBuf[temp1/8]));}; //DEBUG
+              ////if(DEBUG){printf("AMG_WA3:bitPos:%ld buffer[%ld]=0x%lx\n",temp1,temp1/8,*(uint64_t*)(&outBuf[temp1/8]));}; //DEBUG
               break;
           }
         }
         break;
     }
     
-    //if(DEBUG){printf("bitPos_E:%ld\n",bitPos);} //DEBUG
-    if(D_C){if(!((bp->ECQBits>=2)||((bp->ECQBits==1) && (bp->numOutliers==0)))){printf("ERROR: ECQBits:%d numOutliers:%d This should not have happened!\n",bp->ECQBits,bp->numOutliers);assert(0);}} //DEBUG
+    ////if(DEBUG){printf("bitPos_E:%ld\n",bitPos);} //DEBUG
+    if(D_C){if(!((bp->ECQBits>=2)||((bp->ECQBits==1) && (bp->numOutliers==0)))){assert(0);}} //DEBUG: invariant check kept (ECQBits>=2, or ECQBits==1 with zero outliers); only the diagnostic printf is silenced, matching the other asserts this patch leaves active
     
           
 
@@ -473,20 +473,20 @@ static inline void pastri_double_Encode(double *data,int64_t* patternQ,int64_t*
     //*(uint32_t*)(&outBuf[9])=bytePos;
     *(uint32_t*)(&outBuf[1])=bytePos;
     
-    if(D_G)printf("bitPos:%ld CNonSparseBits:%d bytePos:%d CNonSparseBytes:%d\n",bitPos,CNonSparseBits,bytePos,CNonSparseBytes); //DEBUG
+    //if(D_G)printf("bitPos:%ld CNonSparseBits:%d bytePos:%d CNonSparseBytes:%d\n",bitPos,CNonSparseBits,bytePos,CNonSparseBytes); //DEBUG
     if(D_G){assert(bitPos==CNonSparseBits);}
     
   }
-  //for(i=213;i<233;i++)if(DEBUG)printf("AMG_WE:bitPos:%d buffer[%d]=0x%lx\n",i*8,i,*(uint64_t*)(&outBuf[i])); //DEBUG
+  ////for(i=213;i<233;i++)if(DEBUG)printf("AMG_WE:bitPos:%d buffer[%d]=0x%lx\n",i*8,i,*(uint64_t*)(&outBuf[i])); //DEBUG
   
 }
 static inline int pastri_double_Compress(unsigned char*inBuf,pastri_params *p,unsigned char*outBuf,int *numOutBytes){
   pastri_blockParams bp;
 
-  if(D_G2){printf("Parameters: dataSize:%d\n",p->dataSize);}  //DEBUG
-  if(D_G2){printf("Parameters: bfs:%d %d %d %d originalEb:%.3e\n",p->bf[0],p->bf[1],p->bf[2],p->bf[3],p->usedEb);}  //DEBUG
-  if(D_G2){printf("Parameters: idxRanges:%d %d %d %d\n",p->idxRange[0],p->idxRange[1],p->idxRange[2],p->idxRange[3]);} //DEBUG
-  if(D_G2){printf("Parameters: sbSize:%d sbNum:%d bSize:%d\n",p->sbSize,p->sbNum,p->bSize); }//DEBUG
+  //if(D_G2){printf("Parameters: dataSize:%d\n",p->dataSize);}  //DEBUG
+  //if(D_G2){printf("Parameters: bfs:%d %d %d %d originalEb:%.3e\n",p->bf[0],p->bf[1],p->bf[2],p->bf[3],p->usedEb);}  //DEBUG
+  //if(D_G2){printf("Parameters: idxRanges:%d %d %d %d\n",p->idxRange[0],p->idxRange[1],p->idxRange[2],p->idxRange[3]);} //DEBUG
+  //if(D_G2){printf("Parameters: sbSize:%d sbNum:%d bSize:%d\n",p->sbSize,p->sbNum,p->bSize); }//DEBUG
   
   int64_t patternQ[MAX_PS_SIZE];
   int64_t scalesQ[MAX_PS_SIZE];
@@ -566,7 +566,7 @@ static inline void pastri_double_Decode(unsigned char*inBuf,pastri_params *p,pas
   switch(inBuf[0]){
     //R:UCSparse
     case 0:
-      if(D_G){printf("\nDC:UCSparse\n");} //DEBUG
+      //if(D_G){printf("\nDC:UCSparse\n");} //DEBUG
       //bp->nonZeros=*(uint16_t*)(&inBuf[9]);
       //bytePos=11;
       bp->nonZeros=*(uint16_t*)(&inBuf[1]);
@@ -591,19 +591,19 @@ static inline void pastri_double_Decode(unsigned char*inBuf,pastri_params *p,pas
         data[_1DIdx]=*(double*)(&inBuf[bytePos]);
         bytePos+=8; 
       }
-      if(D_G){printf("\nDC:bytePos:%ld\n",bytePos);} //DEBUG
+      //if(D_G){printf("\nDC:bytePos:%ld\n",bytePos);} //DEBUG
       break;
     //R:UCNonSparse
     case 1:
-      if(D_G){printf("\nDC:UCNonSparse\n");} //DEBUG
+      //if(D_G){printf("\nDC:UCNonSparse\n");} //DEBUG
       //memcpy(&outBuf[p->bSize*8], &inBuf[9], p->bSize*8);
       memcpy(data, &inBuf[1], p->bSize*8);
       bytePos=p->bSize*8;
-      if(D_G){printf("\nDC:bytePos:%ld\n",bytePos);} //DEBUG
+      //if(D_G){printf("\nDC:bytePos:%ld\n",bytePos);} //DEBUG
       break;
     //R:CSparse
     case 2:
-      if(D_G){printf("\nDC:CSparse\n");} //DEBUG
+      //if(D_G){printf("\nDC:CSparse\n");} //DEBUG
       //for(j=0;j<p->bSize;j++){
       //  data[j]=0;
       //}
@@ -614,27 +614,27 @@ static inline void pastri_double_Decode(unsigned char*inBuf,pastri_params *p,pas
       bp->patternBits=inBuf[5];
       bp->ECQBits=inBuf[6];
       
-      if(D_R){printf("bp->patternBits:%d bp->ECQBits:%d bp->_1DIdxBits:%d\n",bp->patternBits,bp->ECQBits,bp->_1DIdxBits);} //DEBUG
+      //if(D_R){printf("bp->patternBits:%d bp->ECQBits:%d bp->_1DIdxBits:%d\n",bp->patternBits,bp->ECQBits,bp->_1DIdxBits);} //DEBUG
       
       //bp->numOutliers=*(uint16_t*)(&inBuf[15]);
       //bitPos=17*8;
       bp->numOutliers=*(uint16_t*)(&inBuf[7]);
       bitPos=9*8;
-      if(D_R){printf("bp->numOutliers:%d\n",bp->numOutliers);} //DEBUG
+      //if(D_R){printf("bp->numOutliers:%d\n",bp->numOutliers);} //DEBUG
 
       bp->scalesBinSize=1/(double)(((uint64_t)1<<(bp->patternBits-1))-1);
   
       bp->binSize=p->usedEb*2;
       
-      if(D_R){printf("bp->scalesBinSize:%.6e bp->binSize:%.6e bp->scalesBinSize*bp->binSize:%.6e\n",bp->scalesBinSize,bp->binSize,bp->scalesBinSize*bp->binSize);} //DEBUG
+      //if(D_R){printf("bp->scalesBinSize:%.6e bp->binSize:%.6e bp->scalesBinSize*bp->binSize:%.6e\n",bp->scalesBinSize,bp->binSize,bp->scalesBinSize*bp->binSize);} //DEBUG
 
       for(j=0;j<p->sbSize;j++){
         patternQ[j]=readBits_I64(inBuf,&bitPos,bp->patternBits);//Pattern point
-        if(D_R){printf("R:patternQ[%d]=%ld\n",j,patternQ[j]);}
+        //if(D_R){printf("R:patternQ[%d]=%ld\n",j,patternQ[j]);}
       }
       for(j=0;j<p->sbNum;j++){
         scalesQ[j]=readBits_I64(inBuf,&bitPos,bp->patternBits);//Scale
-        if(D_R){printf("R:scalesQ[%d]=%ld\n",j,scalesQ[j]);}
+        //if(D_R){printf("R:scalesQ[%d]=%ld\n",j,scalesQ[j]);}
       }
       
       /* //Splitting
@@ -648,13 +648,13 @@ static inline void pastri_double_Decode(unsigned char*inBuf,pastri_params *p,pas
       switch(bp->ECQBits){
         case 2:
           for(j=0;j<bp->numOutliers;j++){
-            //if(DEBUG){printf("readBits_UI64:%ld\n",readBits_UI64(inBuf,&bitPos,bp->_1DIdxBits));} //DEBUG
-            //if(DEBUG){printf("readBits_UI64:%ld\n",readBits_I64(inBuf,&bitPos,2));} //DEBUG
+            ////if(DEBUG){printf("readBits_UI64:%ld\n",readBits_UI64(inBuf,&bitPos,bp->_1DIdxBits));} //DEBUG
+            ////if(DEBUG){printf("readBits_UI64:%ld\n",readBits_I64(inBuf,&bitPos,2));} //DEBUG
             
             _1DIdx=readBits_UI64(inBuf,&bitPos,bp->_1DIdxBits);
             ECQTemp=readBits_I64(inBuf,&bitPos,1);
             ECQTemp= ((ECQTemp<<63)>>63)|(uint64_t)0x1;
-            //if(D_R)printf("R:ECQ[%d]: %ld \n",_1DIdx,ECQTemp);
+            ////if(D_R)printf("R:ECQ[%d]: %ld \n",_1DIdx,ECQTemp);
             //continue;
             //sb=_1DIdx/p->sbSize; 
             //localIdx=_1DIdx%p->sbSize;
@@ -662,32 +662,32 @@ static inline void pastri_double_Decode(unsigned char*inBuf,pastri_params *p,pas
             ////data[_1DIdx]-=ECQTemp*bp->binSize;//Splitting
             ECQ[_1DIdx]=ECQTemp;
             
-            //if(DEBUG){printf("decompressed[%d]:%.6e\n",_1DIdx,data[_1DIdx]);} //DEBUG
+            ////if(DEBUG){printf("decompressed[%d]:%.6e\n",_1DIdx,data[_1DIdx]);} //DEBUG
           }
           break;
         default: //bp->ECQBits>2
-          if(D_C){if(!((bp->ECQBits>=2)||((bp->ECQBits==1) && (bp->numOutliers==0)))){printf("ERROR: bp->ECQBits:%d bp->numOutliers:%d This should not have happened!\n",bp->ECQBits,bp->numOutliers);assert(0);}} //DEBUG
+          //if(D_C){if(!((bp->ECQBits>=2)||((bp->ECQBits==1) && (bp->numOutliers==0)))){printf("ERROR: bp->ECQBits:%d bp->numOutliers:%d This should not have happened!\n",bp->ECQBits,bp->numOutliers);assert(0);}} //DEBUG
     
           for(j=0;j<bp->numOutliers;j++){
             _1DIdx=readBits_UI64(inBuf,&bitPos,bp->_1DIdxBits);
             //sb=_1DIdx/p->sbSize; 
             //localIdx=_1DIdx%p->sbSize;
             temp=readBits_UI64(inBuf,&bitPos,1);
-            //if(DEBUG){printf("temp:%ld\n",temp);} //DEBUG
+            ////if(DEBUG){printf("temp:%ld\n",temp);} //DEBUG
             switch(temp){
               case 0:  //+-1
                 ECQTemp=readBits_I64(inBuf,&bitPos,1);
                 ECQTemp= ((ECQTemp<<63)>>63)|(uint64_t)0x1;
-                //if(DEBUG){printf("_1DIdx:%ld ECQTemp:0x%ld\n",_1DIdx,ECQTemp);} //DEBUG
-                //if(D_R)printf("R:ECQ[%d]: %ld \n",_1DIdx,ECQTemp);
+                ////if(DEBUG){printf("_1DIdx:%ld ECQTemp:0x%ld\n",_1DIdx,ECQTemp);} //DEBUG
+                ////if(D_R)printf("R:ECQ[%d]: %ld \n",_1DIdx,ECQTemp);
                 break;
               case 1: //Others
                 ECQTemp=readBits_I64(inBuf,&bitPos,bp->ECQBits);
-                //if(DEBUG){printf("_1DIdx:%ld ECQTemp:0x%ld\n",_1DIdx,ECQTemp);} //DEBUG
-                //if(D_R)printf("R:ECQ[%d]: %ld \n",_1DIdx,ECQTemp);
+                ////if(DEBUG){printf("_1DIdx:%ld ECQTemp:0x%ld\n",_1DIdx,ECQTemp);} //DEBUG
+                ////if(D_R)printf("R:ECQ[%d]: %ld \n",_1DIdx,ECQTemp);
                 break;
               //default:
-              //  printf("ERROR: Bad 2-bit value: 0x%lx",temp);
+              ////  printf("ERROR: Bad 2-bit value: 0x%lx",temp);
               // assert(0); //AMG
               //  break;
             }
@@ -695,7 +695,7 @@ static inline void pastri_double_Decode(unsigned char*inBuf,pastri_params *p,pas
             //data[_1DIdx]-=ECQTemp*bp->binSize;//Splitting
             ECQ[_1DIdx]=ECQTemp;
             
-            //if(DEBUG){printf("decompressed[%d]:%.6e\n",_1DIdx,data[_1DIdx]);} //DEBUG
+            ////if(DEBUG){printf("decompressed[%d]:%.6e\n",_1DIdx,data[_1DIdx]);} //DEBUG
           }
           break;
       }
@@ -704,7 +704,7 @@ static inline void pastri_double_Decode(unsigned char*inBuf,pastri_params *p,pas
       //scalesQ=(int64_t*)(inBuf+15+p->sbSize*8);
       
       bytePos=(bitPos+7)/8;
-      if(D_G){printf("\nDC:bytePos:%ld\n",bytePos);} //DEBUG
+      //if(D_G){printf("\nDC:bytePos:%ld\n",bytePos);} //DEBUG
       
       //STEP 2: PREDICT DATA(Includes INVERSE QUANTIZATION)
       pastri_double_PredictData(p,bp,data,patternQ,scalesQ,ECQ);
@@ -712,7 +712,7 @@ static inline void pastri_double_Decode(unsigned char*inBuf,pastri_params *p,pas
       break;
     //R:CNonSparse
     case 3:
-      if(D_G){printf("\nDC:CNonSparse\n");} //DEBUG
+      //if(D_G){printf("\nDC:CNonSparse\n");} //DEBUG
       
       //for(j=0;j<p->bSize;j++){
       //  data[j]=0;
@@ -724,7 +724,7 @@ static inline void pastri_double_Decode(unsigned char*inBuf,pastri_params *p,pas
       bp->patternBits=inBuf[5];
       bp->ECQBits=inBuf[6];
       
-      if(D_R){printf("bp->patternBits:%d bp->ECQBits:%d bp->_1DIdxBits:%d\n",bp->patternBits,bp->ECQBits,bp->_1DIdxBits);} //DEBUG
+      //if(D_R){printf("bp->patternBits:%d bp->ECQBits:%d bp->_1DIdxBits:%d\n",bp->patternBits,bp->ECQBits,bp->_1DIdxBits);} //DEBUG
       
       //bitPos=15*8;
       bitPos=7*8;
@@ -732,27 +732,27 @@ static inline void pastri_double_Decode(unsigned char*inBuf,pastri_params *p,pas
       bp->scalesBinSize=1/(double)(((uint64_t)1<<(bp->patternBits-1))-1);
       bp->binSize=p->usedEb*2;
       
-      if(D_R){printf("bp->scalesBinSize:%.6e bp->binSize:%.6e bp->scalesBinSize*bp->binSize:%.6e\n",bp->scalesBinSize,bp->binSize,bp->scalesBinSize*bp->binSize);} //DEBUG
+      //if(D_R){printf("bp->scalesBinSize:%.6e bp->binSize:%.6e bp->scalesBinSize*bp->binSize:%.6e\n",bp->scalesBinSize,bp->binSize,bp->scalesBinSize*bp->binSize);} //DEBUG
 
       for(j=0;j<p->sbSize;j++){
         patternQ[j]=readBits_I64(inBuf,&bitPos,bp->patternBits);//Pattern point
-        if(D_R){printf("R:patternQ[%d]=%ld\n",j,patternQ[j]);}
+        //if(D_R){printf("R:patternQ[%d]=%ld\n",j,patternQ[j]);}
       }
       for(j=0;j<p->sbNum;j++){
         scalesQ[j]=readBits_I64(inBuf,&bitPos,bp->patternBits);//Scale
-        if(D_R){printf("R:scalesQ[%d]=%ld\n",j,scalesQ[j]);}
+        //if(D_R){printf("R:scalesQ[%d]=%ld\n",j,scalesQ[j]);}
       }
       /* //Splitting
       for(j=0;j<p->bSize;j++){
         data[j]=scalesQ[j/p->sbSize]*patternQ[j%p->sbSize]*bp->scalesBinSize*bp->binSize;
-        //if(DEBUG){printf("DC:PS[%d]=%.6e\n",j,data[j]);}
+        ////if(DEBUG){printf("DC:PS[%d]=%.6e\n",j,data[j]);}
       }
       */
       switch(bp->ECQBits){
         case 2:
           for(j=0;j<p->bSize;j++){
-            //if(DEBUG){printf("readBits_UI64:%ld\n",readBits_UI64(inBuf,&bitPos,bp->_1DIdxBits));} //DEBUG
-            //if(DEBUG){printf("readBits_UI64:%ld\n",readBits_I64(inBuf,&bitPos,2));} //DEBUG
+            ////if(DEBUG){printf("readBits_UI64:%ld\n",readBits_UI64(inBuf,&bitPos,bp->_1DIdxBits));} //DEBUG
+            ////if(DEBUG){printf("readBits_UI64:%ld\n",readBits_I64(inBuf,&bitPos,2));} //DEBUG
             //_1DIdx=readBits_UI64(inBuf,&bitPos,bp->_1DIdxBits);
             temp=readBits_UI64(inBuf,&bitPos,1);
             switch(temp){
@@ -768,7 +768,7 @@ static inline void pastri_double_Decode(unsigned char*inBuf,pastri_params *p,pas
                 break;
             }
             
-            //if(DEBUG){printf("_1DIdx:%ld ECQTemp:0x%ld\n",_1DIdx,ECQTemp);} //DEBUG
+            ////if(DEBUG){printf("_1DIdx:%ld ECQTemp:0x%ld\n",_1DIdx,ECQTemp);} //DEBUG
             //continue;
             //sb=_1DIdx/p->sbSize; 
             //localIdx=_1DIdx%p->sbSize;
@@ -776,39 +776,39 @@ static inline void pastri_double_Decode(unsigned char*inBuf,pastri_params *p,pas
             //data[j]-=ECQTemp*bp->binSize; //Splitting
             ECQ[j]=ECQTemp;
             
-            //if(DEBUG){printf("decompressed[%d]:%.6e\n",_1DIdx,data[_1DIdx]);} //DEBUG
+            ////if(DEBUG){printf("decompressed[%d]:%.6e\n",_1DIdx,data[_1DIdx]);} //DEBUG
           }
           break;
         default: //bp->ECQBits>2
-          //if(DEBUG)printf("AMG_R1:bitPos: %ld\n",bitPos);
+          ////if(DEBUG)printf("AMG_R1:bitPos: %ld\n",bitPos);
           
           for(j=0;j<p->bSize;j++){
-            //if(DEBUG){printf("AMG_R3:bitPos:%ld buffer[%ld]=0x%lx\n",bitPos,bitPos/8,*(uint64_t*)(&inBuf[bitPos/8]));}; //DEBUG
-            //if(DEBUG)printf("AMG_R2:bitPos: %ld\n",bitPos);
+            ////if(DEBUG){printf("AMG_R3:bitPos:%ld buffer[%ld]=0x%lx\n",bitPos,bitPos/8,*(uint64_t*)(&inBuf[bitPos/8]));}; //DEBUG
+            ////if(DEBUG)printf("AMG_R2:bitPos: %ld\n",bitPos);
 
-            //if(DEBUG){printf("readBits_UI64:%ld\n",readBits_UI64(inBuf,&bitPos,bp->_1DIdxBits));} //DEBUG
-            //if(DEBUG){printf("readBits_UI64:%ld\n",readBits_I64(inBuf,&bitPos,2));} //DEBUG
+            ////if(DEBUG){printf("readBits_UI64:%ld\n",readBits_UI64(inBuf,&bitPos,bp->_1DIdxBits));} //DEBUG
+            ////if(DEBUG){printf("readBits_UI64:%ld\n",readBits_I64(inBuf,&bitPos,2));} //DEBUG
             //_1DIdx=readBits_UI64(inBuf,&bitPos,bp->_1DIdxBits);
             temp=readBits_UI64(inBuf,&bitPos,1);
-            //if(DEBUG){printf("AMG_R3:bitPos:%ld buffer[%ld]=0x%lx\n",bitPos,bitPos/8,*(uint64_t*)(&inBuf[bitPos/8]));}; //DEBUG
+            ////if(DEBUG){printf("AMG_R3:bitPos:%ld buffer[%ld]=0x%lx\n",bitPos,bitPos/8,*(uint64_t*)(&inBuf[bitPos/8]));}; //DEBUG
             switch(temp){
               case 0:
-                //if(DEBUG)printf("Read:0");
+                ////if(DEBUG)printf("Read:0");
                 temp2=readBits_UI64(inBuf,&bitPos,1);
                 switch(temp2){
                   case 0:
-                    //if(DEBUG)printf("0");
+                    ////if(DEBUG)printf("0");
                     ECQTemp=readBits_I64(inBuf,&bitPos,1);
-                    //if(DEBUG){printf("AMG_R3:bitPos:%ld buffer[%ld]=0x%lx\n",bitPos,bitPos/8,*(uint64_t*)(&inBuf[bitPos/8]));}; //DEBUG
-                    //if(DEBUG)printf("R:ECQTemp:%ld\n",ECQTemp);
+                    ////if(DEBUG){printf("AMG_R3:bitPos:%ld buffer[%ld]=0x%lx\n",bitPos,bitPos/8,*(uint64_t*)(&inBuf[bitPos/8]));}; //DEBUG
+                    ////if(DEBUG)printf("R:ECQTemp:%ld\n",ECQTemp);
                     ECQTemp= ((ECQTemp<<63)>>63)|(uint64_t)0x1;
-                    //if(DEBUG)printf("R:ECQ[%d]: %ld\n",j,ECQTemp);
+                    ////if(DEBUG)printf("R:ECQ[%d]: %ld\n",j,ECQTemp);
                     break;
                   case 1:
-                    //if(DEBUG)printf("1\n");
+                    ////if(DEBUG)printf("1\n");
                     ECQTemp=readBits_I64(inBuf,&bitPos,bp->ECQBits);
-                    //if(DEBUG){printf("AMG_R3:bitPos:%ld buffer[%ld]=0x%lx\n",bitPos,bitPos/8,*(uint64_t*)(&inBuf[bitPos/8]));}; //DEBUG
-                    //if(DEBUG)printf("R:ECQ[%d]: %ld\n",j,ECQTemp);
+                    ////if(DEBUG){printf("AMG_R3:bitPos:%ld buffer[%ld]=0x%lx\n",bitPos,bitPos/8,*(uint64_t*)(&inBuf[bitPos/8]));}; //DEBUG
+                    ////if(DEBUG)printf("R:ECQ[%d]: %ld\n",j,ECQTemp);
                     break;
                   default:
                     assert(0);
@@ -816,16 +816,16 @@ static inline void pastri_double_Decode(unsigned char*inBuf,pastri_params *p,pas
                 }
                 break;
               case 1:
-                //if(DEBUG)printf("Read:1\n");
+                ////if(DEBUG)printf("Read:1\n");
                 ECQTemp=0;
-                //if(DEBUG)printf("R:ECQ[%d]: %ld\n",j,ECQTemp);
+                ////if(DEBUG)printf("R:ECQ[%d]: %ld\n",j,ECQTemp);
                 break;
               default:
                 assert(0);
                 break;
             }
             
-            //if(DEBUG){printf("_1DIdx:%ld ECQTemp:0x%ld\n",_1DIdx,ECQTemp);} //DEBUG
+            ////if(DEBUG){printf("_1DIdx:%ld ECQTemp:0x%ld\n",_1DIdx,ECQTemp);} //DEBUG
             //continue;
             //sb=_1DIdx/p->sbSize; 
             //localIdx=_1DIdx%p->sbSize;
@@ -833,7 +833,7 @@ static inline void pastri_double_Decode(unsigned char*inBuf,pastri_params *p,pas
             //data[j]-=ECQTemp*bp->binSize; //Splitting
             ECQ[j]=ECQTemp;
             
-            //if(DEBUG){printf("DC:data[%d]:%.6e\n",j,data[j]);} //DEBUG
+            ////if(DEBUG){printf("DC:data[%d]:%.6e\n",j,data[j]);} //DEBUG
           }
           break;
       }
@@ -841,7 +841,7 @@ static inline void pastri_double_Decode(unsigned char*inBuf,pastri_params *p,pas
       //patternQ=(int64_t*)(inBuf+15); 
       //scalesQ=(int64_t*)(inBuf+15+p->sbSize*8);
       bytePos=(bitPos+7)/8;
-      if(D_G){printf("\nDC:bytePos:%ld\n",bytePos);} //DEBUG
+      //if(D_G){printf("\nDC:bytePos:%ld\n",bytePos);} //DEBUG
       
       //STEP 2: PREDICT DATA(Includes INVERSE QUANTIZATION)
       pastri_double_PredictData(p,bp,data,patternQ,scalesQ,ECQ);
@@ -879,19 +879,19 @@ static inline int pastri_double_Check(unsigned char*inBuf,int dataSize,unsigned
   /*
   for(i=0;i<p->bSize;i++){
     if(idx0[i]!=idx0_dc[i]){
-      printf("idx0[%d]=%d  !=  %d=idx0_dc[%d]",i,idx0[i],idx0_dc[i],i);
+      //printf("idx0[%d]=%d  !=  %d=idx0_dc[%d]",i,idx0[i],idx0_dc[i],i);
       assert(0);
     }
     if(idx1[i]!=idx1_dc[i]){
-      printf("idx1[%d]=%d  !=  %d=idx1_dc[%d]",i,idx1[i],idx1_dc[i],i);
+      //printf("idx1[%d]=%d  !=  %d=idx1_dc[%d]",i,idx1[i],idx1_dc[i],i);
       assert(0);
     }
     if(idx2[i]!=idx2_dc[i]){
-      printf("idx2[%d]=%d  !=  %d=idx2_dc[%d]",i,idx2[i],idx2_dc[i],i);
+      //printf("idx2[%d]=%d  !=  %d=idx2_dc[%d]",i,idx2[i],idx2_dc[i],i);
       assert(0);
     }
     if(idx3[i]!=idx3_dc[i]){
-      printf("idx3[%d]=%d  !=  %d=idx3_dc[%d]",i,idx3[i],idx3_dc[i],i);
+      //printf("idx3[%d]=%d  !=  %d=idx3_dc[%d]",i,idx3[i],idx3_dc[i],i);
       assert(0);
     }
   }
@@ -900,7 +900,7 @@ static inline int pastri_double_Check(unsigned char*inBuf,int dataSize,unsigned
   //Comparing Data:
   for(i=0;i<p->bSize;i++){
     if(abs_FastD(data[i]-data_dc[i])>p->usedEb){
-      printf("|data[%d]-data_dc[%d]|>originalEb : %.3e - %.3e = %.3e > %.3e\n",i,i,data[i],data_dc[i],abs_FastD(data[i]-data_dc[i]),p->usedEb);
+      //printf("|data[%d]-data_dc[%d]|>originalEb : %.3e - %.3e = %.3e > %.3e\n",i,i,data[i],data_dc[i],abs_FastD(data[i]-data_dc[i]),p->usedEb);
       assert(0);
     }
   }
diff --git a/thirdparty/SZ/sz/include/pastriF.h b/thirdparty/SZ/sz/include/pastriF.h
index 08c9c140d80ee6d600234e3e99286d3e5c35c627..5c1d5879649e34636e20b383a4ac7bb818eea0e8 100644
--- a/thirdparty/SZ/sz/include/pastriF.h
+++ b/thirdparty/SZ/sz/include/pastriF.h
@@ -10,11 +10,11 @@ static inline int64_t pastri_float_quantize(float x, float binSize){
   
   half.d=0.5;
   
-  //printf("pastri_float_quantize:\nx=%lf  x=0x%lx\n",x,(*((uint64_t *)(&x))));
-  //printf("sign(x):0x%lx\n", x);
-  //printf("0.5:0x%lx\n", (*((uint64_t *)(&half))));
+  ////printf("pastri_float_quantize:\nx=%lf  x=0x%lx\n",x,(*((uint64_t *)(&x))));
+  ////printf("sign(x):0x%lx\n", x);
+  ////printf("0.5:0x%lx\n", (*((uint64_t *)(&half))));
   half.ui64 |= (u1.ui64 & (uint64_t)0x8000000000000000);
-  //printf("sign(x)*0.5:0x%lx\n", (*((uint64_t *)(&half))));
+  ////printf("sign(x)*0.5:0x%lx\n", (*((uint64_t *)(&half))));
   return (int64_t)(x + half.d);
 }
 
@@ -26,10 +26,10 @@ static inline void pastri_float_PatternMatch(float*data,pastri_params* p,pastri_
   bp->nonZeros=0;
   int i,sb;
   for(i=0;i<p->bSize;i++){
-    //printf("data[%d] = %.16lf\n",i,data[i]);//DEBUG
+    ////printf("data[%d] = %.16lf\n",i,data[i]);//DEBUG
     if(abs_FastD(data[i])>p->usedEb){
       bp->nonZeros++;
-      //if(DEBUG)printf("data[%d]:%.6e\n",i,data[i]); //DEBUG
+      ////if(DEBUG)printf("data[%d]:%.6e\n",i,data[i]); //DEBUG
     }
     if(abs_FastD(data[i])>absExt){
       absExt=abs_FastD(data[i]);
@@ -42,40 +42,40 @@ static inline void pastri_float_PatternMatch(float*data,pastri_params* p,pastri_
   float patternExt=data[extIdx];
   bp->binSize=2*p->usedEb;
   
-  //if(DEBUG){printf("Extremum  : data[%d] = %.6e\n",extIdx,patternExt);} //DEBUG
-  //if(DEBUG){printf("patternIdx: %d\n",patternIdx);} //DEBUG
+  ////if(DEBUG){printf("Extremum  : data[%d] = %.6e\n",extIdx,patternExt);} //DEBUG
+  ////if(DEBUG){printf("patternIdx: %d\n",patternIdx);} //DEBUG
   
-  //if(DEBUG){for(i=0;i<p->sbSize;i++){printf("pattern[%d]=data[%d]=%.6e Quantized:%d\n",i,patternIdx+i,data[patternIdx+i],pastri_float_quantize(data[patternIdx+i]/binSize)  );}   }//DEBUG
+  ////if(DEBUG){for(i=0;i<p->sbSize;i++){printf("pattern[%d]=data[%d]=%.6e Quantized:%d\n",i,patternIdx+i,data[patternIdx+i],pastri_float_quantize(data[patternIdx+i]/binSize)  );}   }//DEBUG
   
   //int64_t *patternQ=(int64_t*)(outBuf+15);  //Possible Improvement!
 
   
   for(i=0;i<p->sbSize;i++){
     patternQ[i]=pastri_float_quantize(data[patternIdx+i],bp->binSize);
-    if(D_W){printf("patternQ[%d]=%ld\n",i,patternQ[i]);}
+    //if(D_W){printf("patternQ[%d]=%ld\n",i,patternQ[i]);}
   }
   
   bp->patternBits=bitsNeeded_float((abs_FastD(patternExt)/bp->binSize)+1)+1;
   bp->scaleBits=bp->patternBits;
   bp->scalesBinSize=1/(float)(((uint64_t)1<<(bp->scaleBits-1))-1);
-  //if(DEBUG){printf("(patternExt/binSize)+1: %.6e\n",(patternExt/binSize)+1);} //DEBUG
-  //if(DEBUG){printf("scaleBits=patternBits: %d\n",scaleBits);} //DEBUG
-  if(D_W){printf("scalesBinSize: %.6e\n",bp->scalesBinSize);} //DEBUG
+  ////if(DEBUG){printf("(patternExt/binSize)+1: %.6e\n",(patternExt/binSize)+1);} //DEBUG
+  ////if(DEBUG){printf("scaleBits=patternBits: %d\n",scaleBits);} //DEBUG
+  //if(D_W){printf("scalesBinSize: %.6e\n",bp->scalesBinSize);} //DEBUG
   
   //Calculate Scales.
   //The index part of the input buffer will be reused to hold Scale, Pattern, etc. values.
   int localExtIdx=extIdx%p->sbSize; //Local extremum index. This is not the actual extremum of the current sb, but rather the index that correspond to the global (block) extremum.
   //int64_t *scalesQ=(int64_t*)(outBuf+15+p->sbSize*8);  //Possible Improvement!
   int patternExtZero=(patternExt==0);
-  //if(DEBUG){printf("patternExtZero: %d\n",patternExtZero);} //DEBUG
+  ////if(DEBUG){printf("patternExtZero: %d\n",patternExtZero);} //DEBUG
   for(sb=0;sb<p->sbNum;sb++){
     //scales[sb]=data[sb*p->sbSize+localExtIdx]/patternExt;
     //scales[sb]=patternExtZero ? 0 : data[sb*p->sbSize+localExtIdx]/patternExt;
     //assert(scales[sb]<=1);
     scalesQ[sb]=pastri_float_quantize((patternExtZero ? 0 : data[sb*p->sbSize+localExtIdx]/patternExt),bp->scalesBinSize);
-    if(D_W){printf("scalesQ[%d]=%ld\n",sb,scalesQ[sb]);}
+    //if(D_W){printf("scalesQ[%d]=%ld\n",sb,scalesQ[sb]);}
   }
-  //if(DEBUG){for(i=0;i<p->sbSize;i++){printf("scalesQ[%d]=%ld \n",i,scalesQ[i]);}} //DEBUG
+  ////if(DEBUG){for(i=0;i<p->sbSize;i++){printf("scalesQ[%d]=%ld \n",i,scalesQ[i]);}} //DEBUG
 
   //int64_t *ECQ=(int64_t*)(outBuf+p->bSize*8); //ECQ is written into outBuf, just be careful when handling it.
 
@@ -92,7 +92,7 @@ static inline void pastri_float_PatternMatch(float*data,pastri_params* p,pastri_
       float absECQ=abs_FastD(ECQ[_1DIdx]);
       if(absECQ > bp->ECQExt)
         bp->ECQExt=absECQ;
-      //if(DEBUG){printf("EC[%d]: %.6e Quantized:%ld \n",_1DIdx,(scalesQ[sb]*patternQ[i]*scalesBinSize*binSize-data[_1DIdx]),ECQ[_1DIdx]);} //DEBUG
+      ////if(DEBUG){printf("EC[%d]: %.6e Quantized:%ld \n",_1DIdx,(scalesQ[sb]*patternQ[i]*scalesBinSize*binSize-data[_1DIdx]),ECQ[_1DIdx]);} //DEBUG
       switch (ECQ[_1DIdx]){
         case 0:
           //ECQ0s++; //Currently not needed
@@ -117,8 +117,8 @@ static inline void pastri_float_PatternMatch(float*data,pastri_params* p,pastri_
       _1DIdx=sb*p->sbSize+i;
       float decompressed=scalesQ[sb]*patternQ[i]*scalesBinSize*binSize-ECQ[_1DIdx]*binSize;
       if(abs_FastD(decompressed-data[_1DIdx])>(p->usedEb)){
-        printf("p->usedEb=%.6e\n",p->usedEb);
-        printf("data[%d]=%.6e decompressed[%d]=%.6e diff=%.6e\n",_1DIdx,data[_1DIdx],_1DIdx,decompressed,abs_FastD(data[_1DIdx]-decompressed));
+        //printf("p->usedEb=%.6e\n",p->usedEb);
+        //printf("data[%d]=%.6e decompressed[%d]=%.6e diff=%.6e\n",_1DIdx,data[_1DIdx],_1DIdx,decompressed,abs_FastD(data[_1DIdx]-decompressed));
         assert(0);
       }
     }
@@ -174,8 +174,8 @@ static inline void pastri_float_Encode(float *data,int64_t* patternQ,int64_t* sc
   //*(uint16_t*)(&outBuf[5])=p->idxOffset[2];
   //*(uint16_t*)(&outBuf[7])=p->idxOffset[3];
     
-  if(D_W){printf("ECQ0s:%d ECQ1s:%d ECQOthers:%d Total:%d\n",p->bSize-bp->ECQ1s-bp->ECQOthers,bp->ECQ1s,bp->ECQOthers,p->bSize);} //DEBUG
-  if(D_W){printf("numOutliers:%d\n",bp->numOutliers);} //DEBUG
+  //if(D_W){printf("ECQ0s:%d ECQ1s:%d ECQOthers:%d Total:%d\n",p->bSize-bp->ECQ1s-bp->ECQOthers,bp->ECQ1s,bp->ECQOthers,p->bSize);} //DEBUG
+  //if(D_W){printf("numOutliers:%d\n",bp->numOutliers);} //DEBUG
   
   //****************************************************************************************
   //if(0){ //DEBUG
@@ -183,8 +183,8 @@ static inline void pastri_float_Encode(float *data,int64_t* patternQ,int64_t* sc
   if((UCSparseBytes<UCNonSparseBytes) && (UCSparseBytes<CSparseBytes) && (UCSparseBytes<CNonSparseBytes) ){ 
     //Uncompressed, Sparse bits. Just like the original GAMESS data. Includes: mode, indexOffsets, nonZeros, indexes, data
     *numOutBytes=UCSparseBytes;
-    if(D_G){printf("UCSparse\n");} //DEBUG
-    if(D_G)printf("ECQBits:%d\n",bp->ECQBits); //DEBUG
+    //if(D_G){printf("UCSparse\n");} //DEBUG
+    //if(D_G)printf("ECQBits:%d\n",bp->ECQBits); //DEBUG
     outBuf[0]=0; //mode
     
     //*(uint16_t*)(&outBuf[9])=nonZeros;
@@ -216,7 +216,7 @@ static inline void pastri_float_Encode(float *data,int64_t* patternQ,int64_t* sc
             }
           }
     
-    if(D_G)printf("UCSparseBytes:%d \n",UCSparseBytes); //DEBUG
+    //if(D_G)printf("UCSparseBytes:%d \n",UCSparseBytes); //DEBUG
     
   //****************************************************************************************
   //}else if(0){ //DEBUG
@@ -224,23 +224,23 @@ static inline void pastri_float_Encode(float *data,int64_t* patternQ,int64_t* sc
   }else if((UCNonSparseBytes<UCSparseBytes) && (UCNonSparseBytes<CSparseBytes) && (UCNonSparseBytes<CNonSparseBytes) ){ 
     //Uncompressed, NonSparse bits. Includes: mode, indexOffsets, data
     *numOutBytes=UCNonSparseBytes;
-    if(D_G){printf("UCNonSparse\n");} //DEBUG
-    if(D_G)printf("ECQBits:%d\n",bp->ECQBits); //DEBUG
+    //if(D_G){printf("UCNonSparse\n");} //DEBUG
+    //if(D_G)printf("ECQBits:%d\n",bp->ECQBits); //DEBUG
     outBuf[0]=1; //mode
     
     //memcpy(&outBuf[9], &inBuf[p->bSize*8], UCNonSparseBytes-9);
     memcpy(&outBuf[1], data, p->bSize*p->dataSize);
     
-    if(D_G)printf("UCNonSparseBytes:%d \n",UCNonSparseBytes); //DEBUG
+    //if(D_G)printf("UCNonSparseBytes:%d \n",UCNonSparseBytes); //DEBUG
     /*
     for(i=0;i<UCNonSparseBytes-17;i++){
-      printf("%d ",inBuf[p->bSize*8+i]);
+      //printf("%d ",inBuf[p->bSize*8+i]);
     }
-    printf("\n");
+    //printf("\n");
     for(i=0;i<UCNonSparseBytes-17;i++){
-      printf("%d ",outBuf[17+i]);
+      //printf("%d ",outBuf[17+i]);
     }
-    printf("\n");
+    //printf("\n");
     */
   //****************************************************************************************
   //}else if(1){ //DEBUG
@@ -248,9 +248,9 @@ static inline void pastri_float_Encode(float *data,int64_t* patternQ,int64_t* sc
   }else if((CSparseBytes<UCNonSparseBytes) && (CSparseBytes<UCSparseBytes) && (CSparseBytes<CNonSparseBytes) ){ 
     //Includes: mode, indexOffsets, compressedBytes, patternBits, ECQBits,numOutliers,P, S, {Indexes(Sparse), ECQ}
     *numOutBytes=CSparseBytes;
-    if(D_G){printf("CSparse\n");} //DEBUG
-    if(D_G)printf("ECQBits:%d\n",bp->ECQBits); //DEBUG
-    //if(DEBUG){printf("patternBits:%d _1DIdxBits:%d\n",patternBits,_1DIdxBits);} //DEBUG
+    //if(D_G){printf("CSparse\n");} //DEBUG
+    //if(D_G)printf("ECQBits:%d\n",bp->ECQBits); //DEBUG
+    ////if(DEBUG){printf("patternBits:%d _1DIdxBits:%d\n",patternBits,_1DIdxBits);} //DEBUG
     outBuf[0]=2; //mode
     
     ////outBuf bytes [1:8] are indexOffsets, which are already written. outBuf bytes [9:12] are reserved for compressedBytes.
@@ -269,17 +269,17 @@ static inline void pastri_float_Encode(float *data,int64_t* patternQ,int64_t* sc
     //Now, we are at the end of 9th byte.
     bitPos=9*8; 
     
-    //if(DEBUG){printf("bitPos_B:%ld\n",bitPos);} //DEBUG
+    ////if(DEBUG){printf("bitPos_B:%ld\n",bitPos);} //DEBUG
 
     for(i=0;i<p->sbSize;i++){
       writeBits_Fast(outBuf,&bitPos,bp->patternBits,patternQ[i]);//Pattern point
     }
-    //if(DEBUG){printf("bitPos_P:%ld\n",bitPos);} //DEBUG
+    ////if(DEBUG){printf("bitPos_P:%ld\n",bitPos);} //DEBUG
     for(i=0;i<p->sbNum;i++){
       writeBits_Fast(outBuf,&bitPos,bp->scaleBits,scalesQ[i]);//Scale
     }
-    //if(DEBUG){printf("bitPos_S:%ld\n",bitPos);} //DEBUG
-    //if(DEBUG)printf("ECQBits:%d\n",ECQBits);
+    ////if(DEBUG){printf("bitPos_S:%ld\n",bitPos);} //DEBUG
+    ////if(DEBUG)printf("ECQBits:%d\n",ECQBits);
     switch(bp->ECQBits){
       case 2:
         for(i=0;i<p->bSize;i++){
@@ -287,7 +287,7 @@ static inline void pastri_float_Encode(float *data,int64_t* patternQ,int64_t* sc
             case 0:
               break;
             case 1:
-              //if(DEBUG)printf("Index:%d ECQ:%ld Written:0x0\n",i,ECQ[i]); //DEBUG
+              ////if(DEBUG)printf("Index:%d ECQ:%ld Written:0x0\n",i,ECQ[i]); //DEBUG
               writeBits_Fast(outBuf,&bitPos,bp->_1DIdxBits,i);
               //writeBits_Fast(outBuf,&bitPos,2,0x10);
               //writeBits_Fast(outBuf,&bitPos,2,0);//0x00
@@ -295,7 +295,7 @@ static inline void pastri_float_Encode(float *data,int64_t* patternQ,int64_t* sc
               writeBits_Fast(outBuf,&bitPos,1,0);//0x00
               break;
             case -1:
-              //if(DEBUG)printf("Index:%d ECQ:%ld Written:0x1\n",i,ECQ[i]); //DEBUG
+              ////if(DEBUG)printf("Index:%d ECQ:%ld Written:0x1\n",i,ECQ[i]); //DEBUG
               writeBits_Fast(outBuf,&bitPos,bp->_1DIdxBits,i);
               //writeBits_Fast(outBuf,&bitPos,2,0x11);
               //writeBits_Fast(outBuf,&bitPos,2,1);//0x01
@@ -314,7 +314,7 @@ static inline void pastri_float_Encode(float *data,int64_t* patternQ,int64_t* sc
           case 0:
             break;
           case 1:
-            //if(DEBUG)printf("Index:%d ECQ:%ld Written:0x00\n",i,ECQ[i]); //DEBUG
+            ////if(DEBUG)printf("Index:%d ECQ:%ld Written:0x00\n",i,ECQ[i]); //DEBUG
             writeBits_Fast(outBuf,&bitPos,bp->_1DIdxBits,i);
             //writeBits_Fast(outBuf,&bitPos,3,0);//0x000
             //writeBits_Fast(outBuf,&bitPos,1,0);
@@ -322,7 +322,7 @@ static inline void pastri_float_Encode(float *data,int64_t* patternQ,int64_t* sc
             writeBits_Fast(outBuf,&bitPos,1,0);
             break;
           case -1:
-            //if(DEBUG)printf("Index:%d ECQ:%ld Written:0x01\n",i,ECQ[i]); //DEBUG
+            ////if(DEBUG)printf("Index:%d ECQ:%ld Written:0x01\n",i,ECQ[i]); //DEBUG
             writeBits_Fast(outBuf,&bitPos,bp->_1DIdxBits,i);
             //writeBits_Fast(outBuf,&bitPos,3,1);//0x001
             //writeBits_Fast(outBuf,&bitPos,1,0);
@@ -330,7 +330,7 @@ static inline void pastri_float_Encode(float *data,int64_t* patternQ,int64_t* sc
             writeBits_Fast(outBuf,&bitPos,1,1);
             break;
           default:
-            //if(DEBUG)printf("Index:%d ECQ:%ld Written:0x1 0x%lx\n",i,ECQ[i],ECQ[i]); //DEBUG
+            ////if(DEBUG)printf("Index:%d ECQ:%ld Written:0x1 0x%lx\n",i,ECQ[i],ECQ[i]); //DEBUG
             writeBits_Fast(outBuf,&bitPos,bp->_1DIdxBits,i);
             //writeBits_Fast(outBuf,&bitPos,2+ECQBits,((uint64_t)0x11<<ECQBits)|ECQ[i]);
             //writeBits_Fast(outBuf,&bitPos,2+ECQBits,(ECQ[i]&((uint64_t)0x00<<ECQBits))|((uint64_t)0x01<<ECQBits));
@@ -343,15 +343,15 @@ static inline void pastri_float_Encode(float *data,int64_t* patternQ,int64_t* sc
       break;
     }
     
-    //if(DEBUG){printf("bitPos_E:%ld\n",bitPos);} //DEBUG
-    if(D_C){if(!((bp->ECQBits>=2)||((bp->ECQBits==1) && (bp->numOutliers==0)))){printf("ERROR: ECQBits:%d numOutliers:%d This should not have happened!\n",bp->ECQBits,bp->numOutliers);assert(0);}} //DEBUG
+    ////if(DEBUG){printf("bitPos_E:%ld\n",bitPos);} //DEBUG
+    //if(D_C){if(!((bp->ECQBits>=2)||((bp->ECQBits==1) && (bp->numOutliers==0)))){printf("ERROR: ECQBits:%d numOutliers:%d This should not have happened!\n",bp->ECQBits,bp->numOutliers);assert(0);}} //DEBUG
           
 
     uint32_t bytePos=(bitPos+7)/8;
     //*(uint32_t*)(&outBuf[9])=bytePos;
     *(uint32_t*)(&outBuf[1])=bytePos;
     
-    if(D_G)printf("bitPos:%ld CSparseBits:%d bytePos:%d CSparseBytes:%d\n",bitPos,CSparseBits,bytePos,CSparseBytes); //DEBUG
+    //if(D_G)printf("bitPos:%ld CSparseBits:%d bytePos:%d CSparseBytes:%d\n",bitPos,CSparseBits,bytePos,CSparseBytes); //DEBUG
     if(D_G){assert(bitPos==CSparseBits);}
     
   //****************************************************************************************
@@ -359,9 +359,9 @@ static inline void pastri_float_Encode(float *data,int64_t* patternQ,int64_t* sc
   }else { 
     //Includes: mode, indexOffsets, compressedBytes, patternBits, ECQBits,P, S, {ECQ}
     *numOutBytes=CNonSparseBytes;
-    if(D_G){printf("CNonSparse\n");} //DEBUG
-    if(D_G)printf("ECQBits:%d\n",bp->ECQBits); //DEBUG
-    //if(DEBUG){printf("patternBits:%d _1DIdxBits:%d\n",patternBits,_1DIdxBits);} //DEBUG
+    //if(D_G){printf("CNonSparse\n");} //DEBUG
+    //if(D_G)printf("ECQBits:%d\n",bp->ECQBits); //DEBUG
+    ////if(DEBUG){printf("patternBits:%d _1DIdxBits:%d\n",patternBits,_1DIdxBits);} //DEBUG
     outBuf[0]=3; //mode
     
     ////outBuf bytes [1:8] are indexOffsets, which are already written. outBuf bytes [9:12] are reserved for compressedBytes.
@@ -374,33 +374,33 @@ static inline void pastri_float_Encode(float *data,int64_t* patternQ,int64_t* sc
     outBuf[6]=bp->ECQBits;
     bitPos=7*8; //Currently, we are at the end of 7th byte.
     
-    //if(DEBUG){printf("bitPos_B:%ld\n",bitPos);} //DEBUG
+    ////if(DEBUG){printf("bitPos_B:%ld\n",bitPos);} //DEBUG
 
     for(i=0;i<p->sbSize;i++){
       writeBits_Fast(outBuf,&bitPos,bp->patternBits,patternQ[i]);//Pattern point
     }
-    //if(DEBUG){printf("bitPos_P:%ld\n",bitPos);} //DEBUG
+    ////if(DEBUG){printf("bitPos_P:%ld\n",bitPos);} //DEBUG
     for(i=0;i<p->sbNum;i++){
       writeBits_Fast(outBuf,&bitPos,bp->scaleBits,scalesQ[i]);//Scale
     }
-    //if(DEBUG){printf("bitPos_S:%ld\n",bitPos);} //DEBUG
-    //if(DEBUG)printf("ECQBits:%d\n",ECQBits);
+    ////if(DEBUG){printf("bitPos_S:%ld\n",bitPos);} //DEBUG
+    ////if(DEBUG)printf("ECQBits:%d\n",ECQBits);
     switch(bp->ECQBits){
       case 2:
         for(i=0;i<p->bSize;i++){
           switch(ECQ[i]){
             case 0:
-              //if(DEBUG)printf("Index:%d ECQ:%d Written:0x1\n",i,ECQ[i]); //DEBUG
+              ////if(DEBUG)printf("Index:%d ECQ:%d Written:0x1\n",i,ECQ[i]); //DEBUG
               writeBits_Fast(outBuf,&bitPos,1,1);//0x1
               break;
             case 1:
-              //if(DEBUG)printf("Index:%d ECQ:%d Written:0x00\n",i,ECQ[i]); //DEBUG
+              ////if(DEBUG)printf("Index:%d ECQ:%d Written:0x00\n",i,ECQ[i]); //DEBUG
               //writeBits_Fast(outBuf,&bitPos,2,0);//0x00
               writeBits_Fast(outBuf,&bitPos,1,0);
               writeBits_Fast(outBuf,&bitPos,1,0);
               break;
             case -1:
-              //if(DEBUG)printf("Index:%d ECQ:%d Written:0x01\n",i,ECQ[i]); //DEBUG
+              ////if(DEBUG)printf("Index:%d ECQ:%d Written:0x01\n",i,ECQ[i]); //DEBUG
               //writeBits_Fast(outBuf,&bitPos,2,2); //0x01
               writeBits_Fast(outBuf,&bitPos,1,0);
               writeBits_Fast(outBuf,&bitPos,1,1);
@@ -412,60 +412,60 @@ static inline void pastri_float_Encode(float *data,int64_t* patternQ,int64_t* sc
         }
         break;
       default: //ECQBits>2
-        //if(DEBUG) printf("AMG_W1:bitPos:%ld\n",bitPos); //DEBUG
+        ////if(DEBUG) printf("AMG_W1:bitPos:%ld\n",bitPos); //DEBUG
         for(i=0;i<p->bSize;i++){
-          //if(DEBUG){printf("AMG_W3:bitPos:%ld buffer[%ld]=0x%lx\n",bitPos,bitPos/8,*(uint64_t*)(&outBuf[bitPos/8]));}; //DEBUG
-          //if(DEBUG) printf("AMG_W2:bitPos:%ld\n",bitPos); //DEBUG
-          //if(DEBUG) printf("ECQ[%d]:%ld\n",i,ECQ[i]); //DEBUG
+          ////if(DEBUG){printf("AMG_W3:bitPos:%ld buffer[%ld]=0x%lx\n",bitPos,bitPos/8,*(uint64_t*)(&outBuf[bitPos/8]));}; //DEBUG
+          ////if(DEBUG) printf("AMG_W2:bitPos:%ld\n",bitPos); //DEBUG
+          ////if(DEBUG) printf("ECQ[%d]:%ld\n",i,ECQ[i]); //DEBUG
           switch(ECQ[i]){
             case 0:
-              //if(DEBUG)printf("Index:%d ECQ:%ld Written:0x1\n",i,ECQ[i]); //DEBUG
-              //if(DEBUG){printf("AMG_WB3:bitPos:%ld buffer[%ld]=0x%lx\n",bitPos,bitPos/8,*(uint64_t*)(&outBuf[bitPos/8]));}; //DEBUG
+              ////if(DEBUG)printf("Index:%d ECQ:%ld Written:0x1\n",i,ECQ[i]); //DEBUG
+              ////if(DEBUG){printf("AMG_WB3:bitPos:%ld buffer[%ld]=0x%lx\n",bitPos,bitPos/8,*(uint64_t*)(&outBuf[bitPos/8]));}; //DEBUG
               //temp1=bitPos;
               writeBits_Fast(outBuf,&bitPos,1,1);  //0x1
               //wVal=1; writeBits_Fast(outBuf,&bitPos,1,wVal); //0x1
-              //if(DEBUG){printf("AMG_WA3:bitPos:%ld buffer[%ld]=0x%lx\n",temp1,temp1/8,*(uint64_t*)(&outBuf[temp1/8]));}; //DEBUG
+              ////if(DEBUG){printf("AMG_WA3:bitPos:%ld buffer[%ld]=0x%lx\n",temp1,temp1/8,*(uint64_t*)(&outBuf[temp1/8]));}; //DEBUG
               break;
             case 1:
-              //if(DEBUG)printf("Index:%d ECQ:%ld Written:0x000\n",i,ECQ[i]); //DEBUG
-              //if(DEBUG){printf("AMG_WB3:bitPos:%ld buffer[%ld]=0x%lx\n",bitPos,bitPos/8,*(uint64_t*)(&outBuf[bitPos/8]));}; //DEBUG
+              ////if(DEBUG)printf("Index:%d ECQ:%ld Written:0x000\n",i,ECQ[i]); //DEBUG
+              ////if(DEBUG){printf("AMG_WB3:bitPos:%ld buffer[%ld]=0x%lx\n",bitPos,bitPos/8,*(uint64_t*)(&outBuf[bitPos/8]));}; //DEBUG
               //temp1=bitPos;
               //writeBits_Fast(outBuf,&bitPos,3,0); //0x000
               writeBits_Fast(outBuf,&bitPos,1,0);
               writeBits_Fast(outBuf,&bitPos,1,0);
               writeBits_Fast(outBuf,&bitPos,1,0);
               //wVal=0; writeBits_Fast(outBuf,&bitPos,3,wVal); //0x000
-              //if(DEBUG){printf("AMG_WA3:bitPos:%ld buffer[%ld]=0x%lx\n",temp1,temp1/8,*(uint64_t*)(&outBuf[temp1/8]));}; //DEBUG
+              ////if(DEBUG){printf("AMG_WA3:bitPos:%ld buffer[%ld]=0x%lx\n",temp1,temp1/8,*(uint64_t*)(&outBuf[temp1/8]));}; //DEBUG
               break;
             case -1:
-              //if(DEBUG)printf("Index:%d ECQ:%ld Written:0x001\n",i,ECQ[i]); //DEBUG
-              //if(DEBUG){printf("AMG_WB3:bitPos:%ld buffer[%ld]=0x%lx\n",bitPos,bitPos/8,*(uint64_t*)(&outBuf[bitPos/8]));}; //DEBUG
+              ////if(DEBUG)printf("Index:%d ECQ:%ld Written:0x001\n",i,ECQ[i]); //DEBUG
+              ////if(DEBUG){printf("AMG_WB3:bitPos:%ld buffer[%ld]=0x%lx\n",bitPos,bitPos/8,*(uint64_t*)(&outBuf[bitPos/8]));}; //DEBUG
               //temp1=bitPos;
               //writeBits_Fast(outBuf,&bitPos,3,8); //0x001
               writeBits_Fast(outBuf,&bitPos,1,0); 
               writeBits_Fast(outBuf,&bitPos,1,0); 
               writeBits_Fast(outBuf,&bitPos,1,1); 
               //wVal=8; writeBits_Fast(outBuf,&bitPos,3,wVal); //0x001
-              //if(DEBUG){printf("AMG_WA3:bitPos:%ld buffer[%ld]=0x%lx\n",temp1,temp1/8,*(uint64_t*)(&outBuf[temp1/8]));}; //DEBUG
+              ////if(DEBUG){printf("AMG_WA3:bitPos:%ld buffer[%ld]=0x%lx\n",temp1,temp1/8,*(uint64_t*)(&outBuf[temp1/8]));}; //DEBUG
               break;
             default:
-              //if(DEBUG)printf("Index:%d ECQ:%ld Written:0x01 0x%lx\n",i,ECQ[i]); //DEBUG
-              //if(DEBUG){printf("AMG_WB3:bitPos:%ld buffer[%ld]=0x%lx\n",bitPos,bitPos/8,*(uint64_t*)(&outBuf[bitPos/8]));}; //DEBUG
+              ////if(DEBUG)printf("Index:%d ECQ:%ld Written:0x01 0x%lx\n",i,ECQ[i]); //DEBUG
+              ////if(DEBUG){printf("AMG_WB3:bitPos:%ld buffer[%ld]=0x%lx\n",bitPos,bitPos/8,*(uint64_t*)(&outBuf[bitPos/8]));}; //DEBUG
               //temp1=bitPos;
               //writeBits_Fast(outBuf,&bitPos,2,2); //0x01
               writeBits_Fast(outBuf,&bitPos,1,0); 
               writeBits_Fast(outBuf,&bitPos,1,1); 
               //wVal=2; writeBits_Fast(outBuf,&bitPos,2,wVal); //0x01
               writeBits_Fast(outBuf,&bitPos,bp->ECQBits,ECQ[i]);
-              //if(DEBUG){printf("AMG_WA3:bitPos:%ld buffer[%ld]=0x%lx\n",temp1,temp1/8,*(uint64_t*)(&outBuf[temp1/8]));}; //DEBUG
+              ////if(DEBUG){printf("AMG_WA3:bitPos:%ld buffer[%ld]=0x%lx\n",temp1,temp1/8,*(uint64_t*)(&outBuf[temp1/8]));}; //DEBUG
               break;
           }
         }
         break;
     }
     
-    //if(DEBUG){printf("bitPos_E:%ld\n",bitPos);} //DEBUG
-    if(D_C){if(!((bp->ECQBits>=2)||((bp->ECQBits==1) && (bp->numOutliers==0)))){printf("ERROR: ECQBits:%d numOutliers:%d This should not have happened!\n",bp->ECQBits,bp->numOutliers);assert(0);}} //DEBUG
+    ////if(DEBUG){printf("bitPos_E:%ld\n",bitPos);} //DEBUG
+    //if(D_C){if(!((bp->ECQBits>=2)||((bp->ECQBits==1) && (bp->numOutliers==0)))){printf("ERROR: ECQBits:%d numOutliers:%d This should not have happened!\n",bp->ECQBits,bp->numOutliers);assert(0);}} //DEBUG
     
           
 
@@ -473,20 +473,20 @@ static inline void pastri_float_Encode(float *data,int64_t* patternQ,int64_t* sc
     //*(uint32_t*)(&outBuf[9])=bytePos;
     *(uint32_t*)(&outBuf[1])=bytePos;
     
-    if(D_G)printf("bitPos:%ld CNonSparseBits:%d bytePos:%d CNonSparseBytes:%d\n",bitPos,CNonSparseBits,bytePos,CNonSparseBytes); //DEBUG
+    //if(D_G)printf("bitPos:%ld CNonSparseBits:%d bytePos:%d CNonSparseBytes:%d\n",bitPos,CNonSparseBits,bytePos,CNonSparseBytes); //DEBUG
     if(D_G){assert(bitPos==CNonSparseBits);}
     
   }
-  //for(i=213;i<233;i++)if(DEBUG)printf("AMG_WE:bitPos:%d buffer[%d]=0x%lx\n",i*8,i,*(uint64_t*)(&outBuf[i])); //DEBUG
+  ////for(i=213;i<233;i++)if(DEBUG)printf("AMG_WE:bitPos:%d buffer[%d]=0x%lx\n",i*8,i,*(uint64_t*)(&outBuf[i])); //DEBUG
   
 }
 static inline int pastri_float_Compress(unsigned char*inBuf,pastri_params *p,unsigned char*outBuf,int *numOutBytes){
   pastri_blockParams bp;
 
-  if(D_G2){printf("Parameters: dataSize:%d\n",p->dataSize);}  //DEBUG
-  if(D_G2){printf("Parameters: bfs:%d %d %d %d originalEb:%.3e\n",p->bf[0],p->bf[1],p->bf[2],p->bf[3],p->usedEb);}  //DEBUG
-  if(D_G2){printf("Parameters: idxRanges:%d %d %d %d\n",p->idxRange[0],p->idxRange[1],p->idxRange[2],p->idxRange[3]);} //DEBUG
-  if(D_G2){printf("Parameters: sbSize:%d sbNum:%d bSize:%d\n",p->sbSize,p->sbNum,p->bSize); }//DEBUG
+  //if(D_G2){printf("Parameters: dataSize:%d\n",p->dataSize);}  //DEBUG
+  //if(D_G2){printf("Parameters: bfs:%d %d %d %d originalEb:%.3e\n",p->bf[0],p->bf[1],p->bf[2],p->bf[3],p->usedEb);}  //DEBUG
+  //if(D_G2){printf("Parameters: idxRanges:%d %d %d %d\n",p->idxRange[0],p->idxRange[1],p->idxRange[2],p->idxRange[3]);} //DEBUG
+  //if(D_G2){printf("Parameters: sbSize:%d sbNum:%d bSize:%d\n",p->sbSize,p->sbNum,p->bSize); }//DEBUG
   
   int64_t patternQ[MAX_PS_SIZE];
   int64_t scalesQ[MAX_PS_SIZE];
@@ -566,7 +566,7 @@ static inline void pastri_float_Decode(unsigned char*inBuf,pastri_params *p,past
   switch(inBuf[0]){
     //R:UCSparse
     case 0:
-      if(D_G){printf("\nDC:UCSparse\n");} //DEBUG
+      //if(D_G){printf("\nDC:UCSparse\n");} //DEBUG
       //bp->nonZeros=*(uint16_t*)(&inBuf[9]);
       //bytePos=11;
       bp->nonZeros=*(uint16_t*)(&inBuf[1]);
@@ -591,19 +591,19 @@ static inline void pastri_float_Decode(unsigned char*inBuf,pastri_params *p,past
         data[_1DIdx]=*(float*)(&inBuf[bytePos]);
         bytePos+=8; 
       }
-      if(D_G){printf("\nDC:bytePos:%ld\n",bytePos);} //DEBUG
+      //if(D_G){printf("\nDC:bytePos:%ld\n",bytePos);} //DEBUG
       break;
     //R:UCNonSparse
     case 1:
-      if(D_G){printf("\nDC:UCNonSparse\n");} //DEBUG
+      //if(D_G){printf("\nDC:UCNonSparse\n");} //DEBUG
       //memcpy(&outBuf[p->bSize*8], &inBuf[9], p->bSize*8);
       memcpy(data, &inBuf[1], p->bSize*8);
       bytePos=p->bSize*8;
-      if(D_G){printf("\nDC:bytePos:%ld\n",bytePos);} //DEBUG
+      //if(D_G){printf("\nDC:bytePos:%ld\n",bytePos);} //DEBUG
       break;
     //R:CSparse
     case 2:
-      if(D_G){printf("\nDC:CSparse\n");} //DEBUG
+      //if(D_G){printf("\nDC:CSparse\n");} //DEBUG
       //for(j=0;j<p->bSize;j++){
       //  data[j]=0;
       //}
@@ -614,27 +614,27 @@ static inline void pastri_float_Decode(unsigned char*inBuf,pastri_params *p,past
       bp->patternBits=inBuf[5];
       bp->ECQBits=inBuf[6];
       
-      if(D_R){printf("bp->patternBits:%d bp->ECQBits:%d bp->_1DIdxBits:%d\n",bp->patternBits,bp->ECQBits,bp->_1DIdxBits);} //DEBUG
+      //if(D_R){printf("bp->patternBits:%d bp->ECQBits:%d bp->_1DIdxBits:%d\n",bp->patternBits,bp->ECQBits,bp->_1DIdxBits);} //DEBUG
       
       //bp->numOutliers=*(uint16_t*)(&inBuf[15]);
       //bitPos=17*8;
       bp->numOutliers=*(uint16_t*)(&inBuf[7]);
       bitPos=9*8;
-      if(D_R){printf("bp->numOutliers:%d\n",bp->numOutliers);} //DEBUG
+      //if(D_R){printf("bp->numOutliers:%d\n",bp->numOutliers);} //DEBUG
 
       bp->scalesBinSize=1/(float)(((uint64_t)1<<(bp->patternBits-1))-1);
   
       bp->binSize=p->usedEb*2;
       
-      if(D_R){printf("bp->scalesBinSize:%.6e bp->binSize:%.6e bp->scalesBinSize*bp->binSize:%.6e\n",bp->scalesBinSize,bp->binSize,bp->scalesBinSize*bp->binSize);} //DEBUG
+      //if(D_R){printf("bp->scalesBinSize:%.6e bp->binSize:%.6e bp->scalesBinSize*bp->binSize:%.6e\n",bp->scalesBinSize,bp->binSize,bp->scalesBinSize*bp->binSize);} //DEBUG
 
       for(j=0;j<p->sbSize;j++){
         patternQ[j]=readBits_I64(inBuf,&bitPos,bp->patternBits);//Pattern point
-        if(D_R){printf("R:patternQ[%d]=%ld\n",j,patternQ[j]);}
+        //if(D_R){printf("R:patternQ[%d]=%ld\n",j,patternQ[j]);}
       }
       for(j=0;j<p->sbNum;j++){
         scalesQ[j]=readBits_I64(inBuf,&bitPos,bp->patternBits);//Scale
-        if(D_R){printf("R:scalesQ[%d]=%ld\n",j,scalesQ[j]);}
+        //if(D_R){printf("R:scalesQ[%d]=%ld\n",j,scalesQ[j]);}
       }
       
       /* //Splitting
@@ -648,13 +648,13 @@ static inline void pastri_float_Decode(unsigned char*inBuf,pastri_params *p,past
       switch(bp->ECQBits){
         case 2:
           for(j=0;j<bp->numOutliers;j++){
-            //if(DEBUG){printf("readBits_UI64:%ld\n",readBits_UI64(inBuf,&bitPos,bp->_1DIdxBits));} //DEBUG
-            //if(DEBUG){printf("readBits_UI64:%ld\n",readBits_I64(inBuf,&bitPos,2));} //DEBUG
+            ////if(DEBUG){printf("readBits_UI64:%ld\n",readBits_UI64(inBuf,&bitPos,bp->_1DIdxBits));} //DEBUG
+            ////if(DEBUG){printf("readBits_UI64:%ld\n",readBits_I64(inBuf,&bitPos,2));} //DEBUG
             
             _1DIdx=readBits_UI64(inBuf,&bitPos,bp->_1DIdxBits);
             ECQTemp=readBits_I64(inBuf,&bitPos,1);
             ECQTemp= ((ECQTemp<<63)>>63)|(uint64_t)0x1;
-            //if(D_R)printf("R:ECQ[%d]: %ld \n",_1DIdx,ECQTemp);
+            ////if(D_R)printf("R:ECQ[%d]: %ld \n",_1DIdx,ECQTemp);
             //continue;
             //sb=_1DIdx/p->sbSize; 
             //localIdx=_1DIdx%p->sbSize;
@@ -662,32 +662,32 @@ static inline void pastri_float_Decode(unsigned char*inBuf,pastri_params *p,past
             ////data[_1DIdx]-=ECQTemp*bp->binSize;//Splitting
             ECQ[_1DIdx]=ECQTemp;
             
-            //if(DEBUG){printf("decompressed[%d]:%.6e\n",_1DIdx,data[_1DIdx]);} //DEBUG
+            ////if(DEBUG){printf("decompressed[%d]:%.6e\n",_1DIdx,data[_1DIdx]);} //DEBUG
           }
           break;
         default: //bp->ECQBits>2
-          if(D_C){if(!((bp->ECQBits>=2)||((bp->ECQBits==1) && (bp->numOutliers==0)))){printf("ERROR: bp->ECQBits:%d bp->numOutliers:%d This should not have happened!\n",bp->ECQBits,bp->numOutliers);assert(0);}} //DEBUG
+          //if(D_C){if(!((bp->ECQBits>=2)||((bp->ECQBits==1) && (bp->numOutliers==0)))){printf("ERROR: bp->ECQBits:%d bp->numOutliers:%d This should not have happened!\n",bp->ECQBits,bp->numOutliers);assert(0);}} //DEBUG
     
           for(j=0;j<bp->numOutliers;j++){
             _1DIdx=readBits_UI64(inBuf,&bitPos,bp->_1DIdxBits);
             //sb=_1DIdx/p->sbSize; 
             //localIdx=_1DIdx%p->sbSize;
             temp=readBits_UI64(inBuf,&bitPos,1);
-            //if(DEBUG){printf("temp:%ld\n",temp);} //DEBUG
+            ////if(DEBUG){printf("temp:%ld\n",temp);} //DEBUG
             switch(temp){
               case 0:  //+-1
                 ECQTemp=readBits_I64(inBuf,&bitPos,1);
                 ECQTemp= ((ECQTemp<<63)>>63)|(uint64_t)0x1;
-                //if(DEBUG){printf("_1DIdx:%ld ECQTemp:0x%ld\n",_1DIdx,ECQTemp);} //DEBUG
-                //if(D_R)printf("R:ECQ[%d]: %ld \n",_1DIdx,ECQTemp);
+                ////if(DEBUG){printf("_1DIdx:%ld ECQTemp:0x%ld\n",_1DIdx,ECQTemp);} //DEBUG
+                ////if(D_R)printf("R:ECQ[%d]: %ld \n",_1DIdx,ECQTemp);
                 break;
               case 1: //Others
                 ECQTemp=readBits_I64(inBuf,&bitPos,bp->ECQBits);
-                //if(DEBUG){printf("_1DIdx:%ld ECQTemp:0x%ld\n",_1DIdx,ECQTemp);} //DEBUG
-                //if(D_R)printf("R:ECQ[%d]: %ld \n",_1DIdx,ECQTemp);
+                ////if(DEBUG){printf("_1DIdx:%ld ECQTemp:0x%ld\n",_1DIdx,ECQTemp);} //DEBUG
+                ////if(D_R)printf("R:ECQ[%d]: %ld \n",_1DIdx,ECQTemp);
                 break;
               //default:
-              //  printf("ERROR: Bad 2-bit value: 0x%lx",temp);
+              ////  printf("ERROR: Bad 2-bit value: 0x%lx",temp);
               // assert(0); //AMG
               //  break;
             }
@@ -695,7 +695,7 @@ static inline void pastri_float_Decode(unsigned char*inBuf,pastri_params *p,past
             //data[_1DIdx]-=ECQTemp*bp->binSize;//Splitting
             ECQ[_1DIdx]=ECQTemp;
             
-            //if(DEBUG){printf("decompressed[%d]:%.6e\n",_1DIdx,data[_1DIdx]);} //DEBUG
+            ////if(DEBUG){printf("decompressed[%d]:%.6e\n",_1DIdx,data[_1DIdx]);} //DEBUG
           }
           break;
       }
@@ -704,7 +704,7 @@ static inline void pastri_float_Decode(unsigned char*inBuf,pastri_params *p,past
       //scalesQ=(int64_t*)(inBuf+15+p->sbSize*8);
       
       bytePos=(bitPos+7)/8;
-      if(D_G){printf("\nDC:bytePos:%ld\n",bytePos);} //DEBUG
+      //if(D_G){printf("\nDC:bytePos:%ld\n",bytePos);} //DEBUG
       
       //STEP 2: PREDICT DATA(Includes INVERSE QUANTIZATION)
       pastri_float_PredictData(p,bp,data,patternQ,scalesQ,ECQ);
@@ -712,7 +712,7 @@ static inline void pastri_float_Decode(unsigned char*inBuf,pastri_params *p,past
       break;
     //R:CNonSparse
     case 3:
-      if(D_G){printf("\nDC:CNonSparse\n");} //DEBUG
+      //if(D_G){printf("\nDC:CNonSparse\n");} //DEBUG
       
       //for(j=0;j<p->bSize;j++){
       //  data[j]=0;
@@ -724,7 +724,7 @@ static inline void pastri_float_Decode(unsigned char*inBuf,pastri_params *p,past
       bp->patternBits=inBuf[5];
       bp->ECQBits=inBuf[6];
       
-      if(D_R){printf("bp->patternBits:%d bp->ECQBits:%d bp->_1DIdxBits:%d\n",bp->patternBits,bp->ECQBits,bp->_1DIdxBits);} //DEBUG
+      //if(D_R){printf("bp->patternBits:%d bp->ECQBits:%d bp->_1DIdxBits:%d\n",bp->patternBits,bp->ECQBits,bp->_1DIdxBits);} //DEBUG
       
       //bitPos=15*8;
       bitPos=7*8;
@@ -732,27 +732,27 @@ static inline void pastri_float_Decode(unsigned char*inBuf,pastri_params *p,past
       bp->scalesBinSize=1/(float)(((uint64_t)1<<(bp->patternBits-1))-1);
       bp->binSize=p->usedEb*2;
       
-      if(D_R){printf("bp->scalesBinSize:%.6e bp->binSize:%.6e bp->scalesBinSize*bp->binSize:%.6e\n",bp->scalesBinSize,bp->binSize,bp->scalesBinSize*bp->binSize);} //DEBUG
+      //if(D_R){printf("bp->scalesBinSize:%.6e bp->binSize:%.6e bp->scalesBinSize*bp->binSize:%.6e\n",bp->scalesBinSize,bp->binSize,bp->scalesBinSize*bp->binSize);} //DEBUG
 
       for(j=0;j<p->sbSize;j++){
         patternQ[j]=readBits_I64(inBuf,&bitPos,bp->patternBits);//Pattern point
-        if(D_R){printf("R:patternQ[%d]=%ld\n",j,patternQ[j]);}
+        //if(D_R){printf("R:patternQ[%d]=%ld\n",j,patternQ[j]);}
       }
       for(j=0;j<p->sbNum;j++){
         scalesQ[j]=readBits_I64(inBuf,&bitPos,bp->patternBits);//Scale
-        if(D_R){printf("R:scalesQ[%d]=%ld\n",j,scalesQ[j]);}
+        //if(D_R){printf("R:scalesQ[%d]=%ld\n",j,scalesQ[j]);}
       }
       /* //Splitting
       for(j=0;j<p->bSize;j++){
         data[j]=scalesQ[j/p->sbSize]*patternQ[j%p->sbSize]*bp->scalesBinSize*bp->binSize;
-        //if(DEBUG){printf("DC:PS[%d]=%.6e\n",j,data[j]);}
+        ////if(DEBUG){printf("DC:PS[%d]=%.6e\n",j,data[j]);}
       }
       */
       switch(bp->ECQBits){
         case 2:
           for(j=0;j<p->bSize;j++){
-            //if(DEBUG){printf("readBits_UI64:%ld\n",readBits_UI64(inBuf,&bitPos,bp->_1DIdxBits));} //DEBUG
-            //if(DEBUG){printf("readBits_UI64:%ld\n",readBits_I64(inBuf,&bitPos,2));} //DEBUG
+            ////if(DEBUG){printf("readBits_UI64:%ld\n",readBits_UI64(inBuf,&bitPos,bp->_1DIdxBits));} //DEBUG
+            ////if(DEBUG){printf("readBits_UI64:%ld\n",readBits_I64(inBuf,&bitPos,2));} //DEBUG
             //_1DIdx=readBits_UI64(inBuf,&bitPos,bp->_1DIdxBits);
             temp=readBits_UI64(inBuf,&bitPos,1);
             switch(temp){
@@ -768,7 +768,7 @@ static inline void pastri_float_Decode(unsigned char*inBuf,pastri_params *p,past
                 break;
             }
             
-            //if(DEBUG){printf("_1DIdx:%ld ECQTemp:0x%ld\n",_1DIdx,ECQTemp);} //DEBUG
+            ////if(DEBUG){printf("_1DIdx:%ld ECQTemp:0x%ld\n",_1DIdx,ECQTemp);} //DEBUG
             //continue;
             //sb=_1DIdx/p->sbSize; 
             //localIdx=_1DIdx%p->sbSize;
@@ -776,39 +776,39 @@ static inline void pastri_float_Decode(unsigned char*inBuf,pastri_params *p,past
             //data[j]-=ECQTemp*bp->binSize; //Splitting
             ECQ[j]=ECQTemp;
             
-            //if(DEBUG){printf("decompressed[%d]:%.6e\n",_1DIdx,data[_1DIdx]);} //DEBUG
+            ////if(DEBUG){printf("decompressed[%d]:%.6e\n",_1DIdx,data[_1DIdx]);} //DEBUG
           }
           break;
         default: //bp->ECQBits>2
-          //if(DEBUG)printf("AMG_R1:bitPos: %ld\n",bitPos);
+          ////if(DEBUG)printf("AMG_R1:bitPos: %ld\n",bitPos);
           
           for(j=0;j<p->bSize;j++){
-            //if(DEBUG){printf("AMG_R3:bitPos:%ld buffer[%ld]=0x%lx\n",bitPos,bitPos/8,*(uint64_t*)(&inBuf[bitPos/8]));}; //DEBUG
-            //if(DEBUG)printf("AMG_R2:bitPos: %ld\n",bitPos);
+            ////if(DEBUG){printf("AMG_R3:bitPos:%ld buffer[%ld]=0x%lx\n",bitPos,bitPos/8,*(uint64_t*)(&inBuf[bitPos/8]));}; //DEBUG
+            ////if(DEBUG)printf("AMG_R2:bitPos: %ld\n",bitPos);
 
-            //if(DEBUG){printf("readBits_UI64:%ld\n",readBits_UI64(inBuf,&bitPos,bp->_1DIdxBits));} //DEBUG
-            //if(DEBUG){printf("readBits_UI64:%ld\n",readBits_I64(inBuf,&bitPos,2));} //DEBUG
+            ////if(DEBUG){printf("readBits_UI64:%ld\n",readBits_UI64(inBuf,&bitPos,bp->_1DIdxBits));} //DEBUG
+            ////if(DEBUG){printf("readBits_UI64:%ld\n",readBits_I64(inBuf,&bitPos,2));} //DEBUG
             //_1DIdx=readBits_UI64(inBuf,&bitPos,bp->_1DIdxBits);
             temp=readBits_UI64(inBuf,&bitPos,1);
-            //if(DEBUG){printf("AMG_R3:bitPos:%ld buffer[%ld]=0x%lx\n",bitPos,bitPos/8,*(uint64_t*)(&inBuf[bitPos/8]));}; //DEBUG
+            ////if(DEBUG){printf("AMG_R3:bitPos:%ld buffer[%ld]=0x%lx\n",bitPos,bitPos/8,*(uint64_t*)(&inBuf[bitPos/8]));}; //DEBUG
             switch(temp){
               case 0:
-                //if(DEBUG)printf("Read:0");
+                ////if(DEBUG)printf("Read:0");
                 temp2=readBits_UI64(inBuf,&bitPos,1);
                 switch(temp2){
                   case 0:
-                    //if(DEBUG)printf("0");
+                    ////if(DEBUG)printf("0");
                     ECQTemp=readBits_I64(inBuf,&bitPos,1);
-                    //if(DEBUG){printf("AMG_R3:bitPos:%ld buffer[%ld]=0x%lx\n",bitPos,bitPos/8,*(uint64_t*)(&inBuf[bitPos/8]));}; //DEBUG
-                    //if(DEBUG)printf("R:ECQTemp:%ld\n",ECQTemp);
+                    ////if(DEBUG){printf("AMG_R3:bitPos:%ld buffer[%ld]=0x%lx\n",bitPos,bitPos/8,*(uint64_t*)(&inBuf[bitPos/8]));}; //DEBUG
+                    ////if(DEBUG)printf("R:ECQTemp:%ld\n",ECQTemp);
                     ECQTemp= ((ECQTemp<<63)>>63)|(uint64_t)0x1;
-                    //if(DEBUG)printf("R:ECQ[%d]: %ld\n",j,ECQTemp);
+                    ////if(DEBUG)printf("R:ECQ[%d]: %ld\n",j,ECQTemp);
                     break;
                   case 1:
-                    //if(DEBUG)printf("1\n");
+                    ////if(DEBUG)printf("1\n");
                     ECQTemp=readBits_I64(inBuf,&bitPos,bp->ECQBits);
-                    //if(DEBUG){printf("AMG_R3:bitPos:%ld buffer[%ld]=0x%lx\n",bitPos,bitPos/8,*(uint64_t*)(&inBuf[bitPos/8]));}; //DEBUG
-                    //if(DEBUG)printf("R:ECQ[%d]: %ld\n",j,ECQTemp);
+                    ////if(DEBUG){printf("AMG_R3:bitPos:%ld buffer[%ld]=0x%lx\n",bitPos,bitPos/8,*(uint64_t*)(&inBuf[bitPos/8]));}; //DEBUG
+                    ////if(DEBUG)printf("R:ECQ[%d]: %ld\n",j,ECQTemp);
                     break;
                   default:
                     assert(0);
@@ -816,16 +816,16 @@ static inline void pastri_float_Decode(unsigned char*inBuf,pastri_params *p,past
                 }
                 break;
               case 1:
-                //if(DEBUG)printf("Read:1\n");
+                ////if(DEBUG)printf("Read:1\n");
                 ECQTemp=0;
-                //if(DEBUG)printf("R:ECQ[%d]: %ld\n",j,ECQTemp);
+                ////if(DEBUG)printf("R:ECQ[%d]: %ld\n",j,ECQTemp);
                 break;
               default:
                 assert(0);
                 break;
             }
             
-            //if(DEBUG){printf("_1DIdx:%ld ECQTemp:0x%ld\n",_1DIdx,ECQTemp);} //DEBUG
+            ////if(DEBUG){printf("_1DIdx:%ld ECQTemp:0x%ld\n",_1DIdx,ECQTemp);} //DEBUG
             //continue;
             //sb=_1DIdx/p->sbSize; 
             //localIdx=_1DIdx%p->sbSize;
@@ -833,7 +833,7 @@ static inline void pastri_float_Decode(unsigned char*inBuf,pastri_params *p,past
             //data[j]-=ECQTemp*bp->binSize; //Splitting
             ECQ[j]=ECQTemp;
             
-            //if(DEBUG){printf("DC:data[%d]:%.6e\n",j,data[j]);} //DEBUG
+            ////if(DEBUG){printf("DC:data[%d]:%.6e\n",j,data[j]);} //DEBUG
           }
           break;
       }
@@ -841,7 +841,7 @@ static inline void pastri_float_Decode(unsigned char*inBuf,pastri_params *p,past
       //patternQ=(int64_t*)(inBuf+15); 
       //scalesQ=(int64_t*)(inBuf+15+p->sbSize*8);
       bytePos=(bitPos+7)/8;
-      if(D_G){printf("\nDC:bytePos:%ld\n",bytePos);} //DEBUG
+      //if(D_G){printf("\nDC:bytePos:%ld\n",bytePos);} //DEBUG
       
       //STEP 2: PREDICT DATA(Includes INVERSE QUANTIZATION)
       pastri_float_PredictData(p,bp,data,patternQ,scalesQ,ECQ);
@@ -879,19 +879,19 @@ static inline int pastri_float_Check(unsigned char*inBuf,int dataSize,unsigned c
   /*
   for(i=0;i<p->bSize;i++){
     if(idx0[i]!=idx0_dc[i]){
-      printf("idx0[%d]=%d  !=  %d=idx0_dc[%d]",i,idx0[i],idx0_dc[i],i);
+      //printf("idx0[%d]=%d  !=  %d=idx0_dc[%d]",i,idx0[i],idx0_dc[i],i);
       assert(0);
     }
     if(idx1[i]!=idx1_dc[i]){
-      printf("idx1[%d]=%d  !=  %d=idx1_dc[%d]",i,idx1[i],idx1_dc[i],i);
+      //printf("idx1[%d]=%d  !=  %d=idx1_dc[%d]",i,idx1[i],idx1_dc[i],i);
       assert(0);
     }
     if(idx2[i]!=idx2_dc[i]){
-      printf("idx2[%d]=%d  !=  %d=idx2_dc[%d]",i,idx2[i],idx2_dc[i],i);
+      //printf("idx2[%d]=%d  !=  %d=idx2_dc[%d]",i,idx2[i],idx2_dc[i],i);
       assert(0);
     }
     if(idx3[i]!=idx3_dc[i]){
-      printf("idx3[%d]=%d  !=  %d=idx3_dc[%d]",i,idx3[i],idx3_dc[i],i);
+      //printf("idx3[%d]=%d  !=  %d=idx3_dc[%d]",i,idx3[i],idx3_dc[i],i);
       assert(0);
     }
   }
@@ -900,7 +900,7 @@ static inline int pastri_float_Check(unsigned char*inBuf,int dataSize,unsigned c
   //Comparing Data:
   for(i=0;i<p->bSize;i++){
     if(abs_FastD(data[i]-data_dc[i])>p->usedEb){
-      printf("|data[%d]-data_dc[%d]|>originalEb : %.3e - %.3e = %.3e > %.3e\n",i,i,data[i],data_dc[i],abs_FastD(data[i]-data_dc[i]),p->usedEb);
+      //printf("|data[%d]-data_dc[%d]|>originalEb : %.3e - %.3e = %.3e > %.3e\n",i,i,data[i],data_dc[i],abs_FastD(data[i]-data_dc[i]),p->usedEb);
       assert(0);
     }
   }
diff --git a/thirdparty/SZ/sz/include/sz.h b/thirdparty/SZ/sz/include/sz.h
index 31c326091118500114a3299759b6387fb6f2702e..42f1fc8aabbdb3d604d31490cc7600034de7ac55 100644
--- a/thirdparty/SZ/sz/include/sz.h
+++ b/thirdparty/SZ/sz/include/sz.h
@@ -54,6 +54,7 @@
 #include "pastri.h"
 #include "sz_float_ts.h"
 #include "szd_float_ts.h"
+#include "utility.h"
 
 #ifdef _WIN32
 #define PATH_SEPARATOR ';'
@@ -74,11 +75,11 @@ extern "C" {
 //typedef long int64_t;
 //typedef unsigned long uint64_t;
 
-#define SZ_VERNUM 0x0140
-#define SZ_VER_MAJOR 1
-#define SZ_VER_MINOR 4
-#define SZ_VER_BUILD 13
-#define SZ_VER_REVISION 5
+#define SZ_VERNUM 0x0200
+#define SZ_VER_MAJOR 2
+#define SZ_VER_MINOR 0
+#define SZ_VER_BUILD 2
+#define SZ_VER_REVISION 0
 
 #define PASTRI 103
 #define HZ 102
@@ -130,6 +131,9 @@ extern "C" {
 #define SZ_DEFAULT_COMPRESSION 2
 #define SZ_TEMPORAL_COMPRESSION 3
 
+#define SZ_NO_REGRESSION 0
+#define SZ_WITH_LINEAR_REGRESSION 1
+
 #define SZ_PWR_MIN_TYPE 0
 #define SZ_PWR_AVG_TYPE 1
 #define SZ_PWR_MAX_TYPE 2
@@ -151,6 +155,10 @@ extern "C" {
 #define MetaDataByteLength 20	
 	
 #define numOfBufferedSteps 1 //the number of time steps in the buffer	
+
+
+#define GZIP_COMPRESSOR 0 //i.e., ZLIB_COMPRESSOR
+#define ZSTD_COMPRESSOR 1
 	
 //Note: the following setting should be consistent with stateNum in Huffman.h
 //#define intvCapacity 65536
@@ -236,6 +244,7 @@ typedef struct sz_params
 	unsigned int quantization_intervals; 
 	unsigned int maxRangeRadius;
 	int sol_ID;// it's always SZ, unless the setting is PASTRI compression mode (./configure --enable-pastri)
+	int losslessCompressor;
 	int sampleDistance; //2 bytes
 	float predThreshold;  // 2 bytes
 	int szMode; //* 0 (best speed) or 1 (better compression with Gzip) or 3 temporal-dimension based compression
@@ -278,6 +287,10 @@ typedef struct sz_tsc_metainfo
 	int currentStep;
 	char metadata_filename[256];
 	FILE *metadata_file;
+	unsigned char* bit_array; //sihuan added
+	size_t intersect_size; //sihuan added
+	int64_t* hist_index; //sihuan added: prestep index 
+
 } sz_tsc_metadata;
 
 extern int versionNumber[4];
@@ -289,6 +302,8 @@ extern int sysEndianType; //*sysEndianType is actually set automatically.
 extern sz_params *confparams_cpr;
 extern sz_params *confparams_dec;
 extern sz_exedata *exe_params;
+extern int sz_with_regression;
+
 //------------------------------------------------
 extern SZ_VarSet* sz_varset;
 extern sz_multisteps *multisteps; //compression based on multiple time steps (time-dimension based compression)
@@ -356,8 +371,6 @@ void filloutDimArray(size_t* dim, size_t r5, size_t r4, size_t r3, size_t r2, si
 
 size_t compute_total_batch_size();
 
-int isZlibFormat(unsigned char magic1, unsigned char magic2);
-
 void SZ_registerVar(char* varName, int dataType, void* data, 
 			int errBoundMode, double absErrBound, double relBoundRatio, double pwRelBoundRatio, 
 			size_t r5, size_t r4, size_t r3, size_t r2, size_t r1);
diff --git a/thirdparty/SZ/sz/include/sz_double.h b/thirdparty/SZ/sz/include/sz_double.h
index b186d12d0afee51724debbcbf5ad40e1183bc361..99623661da4e4ba72190430694ab64f687bd062c 100644
--- a/thirdparty/SZ/sz/include/sz_double.h
+++ b/thirdparty/SZ/sz/include/sz_double.h
@@ -75,6 +75,12 @@ size_t r1, size_t r2, size_t r3, size_t s1, size_t s2, size_t s3, size_t e1, siz
 TightDataPointStorageD* SZ_compress_double_4D_MDQ_subblock(double *oriData, double realPrecision, double valueRangeSize, double medianValue_d,
 size_t r1, size_t r2, size_t r3, size_t r4, size_t s1, size_t s2, size_t s3, size_t s4, size_t e1, size_t e2, size_t e3, size_t e4);
 
+unsigned int optimize_intervals_double_2D_with_freq_and_dense_pos(double *oriData, size_t r1, size_t r2, double realPrecision, double * dense_pos, double * max_freq, double * mean_freq);
+unsigned int optimize_intervals_double_3D_with_freq_and_dense_pos(double *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, double * dense_pos, double * max_freq, double * mean_freq);
+unsigned char * SZ_compress_double_2D_MDQ_nonblocked_with_blocked_regression(double *oriData, size_t r1, size_t r2, double realPrecision, size_t * comp_size);
+unsigned char * SZ_compress_double_3D_MDQ_nonblocked_with_blocked_regression(double *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, size_t * comp_size);
+
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/thirdparty/SZ/sz/include/sz_double_pwr.h b/thirdparty/SZ/sz/include/sz_double_pwr.h
index ce81629d089d14b567ba4a6ab3da561cb6ece624..9cb9978cc57a7e5feb427fccbeb804878b706c23 100644
--- a/thirdparty/SZ/sz/include/sz_double_pwr.h
+++ b/thirdparty/SZ/sz/include/sz_double_pwr.h
@@ -37,6 +37,10 @@ double absErrBound, double relBoundRatio, double pwrErrRatio, double valueRangeS
 void SZ_compress_args_double_NoCkRngeNoGzip_1D_pwrgroup(unsigned char** newByteData, double *oriData,
 size_t dataLength, double absErrBound, double relBoundRatio, double pwrErrRatio, double valueRangeSize, double medianValue_f, size_t *outSize);
 
+void SZ_compress_args_double_NoCkRngeNoGzip_1D_pwr_pre_log(unsigned char** newByteData, double *oriData, double globalPrecision, size_t dataLength, size_t *outSize, double min, double max);
+void SZ_compress_args_double_NoCkRngeNoGzip_2D_pwr_pre_log(unsigned char** newByteData, double *oriData, double globalPrecision, size_t r1, size_t r2, size_t *outSize, double min, double max);
+void SZ_compress_args_double_NoCkRngeNoGzip_3D_pwr_pre_log(unsigned char** newByteData, double *oriData, double globalPrecision, size_t r1, size_t r2, size_t r3, size_t *outSize, double min, double max);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/thirdparty/SZ/sz/include/sz_float.h b/thirdparty/SZ/sz/include/sz_float.h
index 6ab92319f5c5aa0c21fdf1ba492611aeeeea1ea0..d08827c472229fb7ab5780c5581aa859a325a0b7 100644
--- a/thirdparty/SZ/sz/include/sz_float.h
+++ b/thirdparty/SZ/sz/include/sz_float.h
@@ -128,6 +128,14 @@ size_t r1, size_t r2, size_t r3, size_t s1, size_t s2, size_t s3, size_t e1, siz
 TightDataPointStorageF* SZ_compress_float_4D_MDQ_subblock(float *oriData, double realPrecision, float valueRangeSize, float medianValue_f,
 size_t r1, size_t r2, size_t r3, size_t r4, size_t s1, size_t s2, size_t s3, size_t s4, size_t e1, size_t e2, size_t e3, size_t e4);
 
+
+unsigned int optimize_intervals_float_2D_with_freq_and_dense_pos(float *oriData, size_t r1, size_t r2, double realPrecision, float * dense_pos, float * max_freq, float * mean_freq);
+unsigned int optimize_intervals_float_3D_with_freq_and_dense_pos(float *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, float * dense_pos, float * max_freq, float * mean_freq);
+
+unsigned char * SZ_compress_float_2D_MDQ_nonblocked_with_blocked_regression(float *oriData, size_t r1, size_t r2, double realPrecision, size_t * comp_size);
+unsigned char * SZ_compress_float_3D_MDQ_nonblocked_with_blocked_regression(float *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, size_t * comp_size);
+unsigned char * SZ_compress_float_3D_MDQ_random_access_with_blocked_regression(float *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, size_t * comp_size);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/thirdparty/SZ/sz/include/sz_float_pwr.h b/thirdparty/SZ/sz/include/sz_float_pwr.h
index 3ef0e7cd858869a7328f4140d37a469c86a0757f..a2432b326a4a2f7f97f512eca2528818a00bca4d 100644
--- a/thirdparty/SZ/sz/include/sz_float_pwr.h
+++ b/thirdparty/SZ/sz/include/sz_float_pwr.h
@@ -44,6 +44,10 @@ double absErrBound, double relBoundRatio, double pwrErrRatio, float valueRangeSi
 void SZ_compress_args_float_NoCkRngeNoGzip_1D_pwrgroup(unsigned char** newByteData, float *oriData,
 size_t dataLength, double absErrBound, double relBoundRatio, double pwrErrRatio, float valueRangeSize, float medianValue_f, size_t *outSize);
 
+void SZ_compress_args_float_NoCkRngeNoGzip_1D_pwr_pre_log(unsigned char** newByteData, float *oriData, double pwrErrRatio, size_t dataLength, size_t *outSize, float min, float max);
+void SZ_compress_args_float_NoCkRngeNoGzip_2D_pwr_pre_log(unsigned char** newByteData, float *oriData, double pwrErrRatio, size_t r1, size_t r2, size_t *outSize, float min, float max);
+void SZ_compress_args_float_NoCkRngeNoGzip_3D_pwr_pre_log(unsigned char** newByteData, float *oriData, double pwrErrRatio, size_t r1, size_t r2, size_t r3, size_t *outSize, float min, float max);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/thirdparty/SZ/sz/include/szd_double.h b/thirdparty/SZ/sz/include/szd_double.h
index daf3622de3f45e8a31c84b2d6191db6acf2a709b..15bb81c92e8efa9592cc5e9e05316d4efd8268d4 100644
--- a/thirdparty/SZ/sz/include/szd_double.h
+++ b/thirdparty/SZ/sz/include/szd_double.h
@@ -24,6 +24,8 @@ void getSnapshotData_double_1D(double** data, size_t dataSeriesLength, TightData
 void getSnapshotData_double_2D(double** data, size_t r1, size_t r2, TightDataPointStorageD* tdps, int errBoundMode);
 void getSnapshotData_double_3D(double** data, size_t r1, size_t r2, size_t r3, TightDataPointStorageD* tdps, int errBoundMode);
 void getSnapshotData_double_4D(double** data, size_t r1, size_t r2, size_t r3, size_t r4, TightDataPointStorageD* tdps, int errBoundMode);
+void decompressDataSeries_double_2D_nonblocked_with_blocked_regression(double** data, size_t r1, size_t r2, unsigned char* comp_data);
+void decompressDataSeries_double_3D_nonblocked_with_blocked_regression(double** data, size_t r1, size_t r2, size_t r3, unsigned char* comp_data);
 
 int SZ_decompress_args_double(double** newData, size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, unsigned char* cmpBytes, size_t cmpSize);
 
diff --git a/thirdparty/SZ/sz/include/szd_double_pwr.h b/thirdparty/SZ/sz/include/szd_double_pwr.h
index 5d3b257b93bd3e843d637d4f84fa9da9ee3a2a1b..2756685f5b3116fb2147d6202a719a88600d1523 100644
--- a/thirdparty/SZ/sz/include/szd_double_pwr.h
+++ b/thirdparty/SZ/sz/include/szd_double_pwr.h
@@ -21,6 +21,10 @@ double* extractRealPrecision_3D_double(size_t R1, size_t R2, size_t R3, int bloc
 void decompressDataSeries_double_3D_pwr(double** data, size_t r1, size_t r2, size_t r3, TightDataPointStorageD* tdps);
 
 void decompressDataSeries_double_1D_pwrgroup(double** data, size_t dataSeriesLength, TightDataPointStorageD* tdps);
+void decompressDataSeries_double_1D_pwr_pre_log(double** data, size_t dataSeriesLength, TightDataPointStorageD* tdps);
+void decompressDataSeries_double_2D_pwr_pre_log(double** data, size_t r1, size_t r2, TightDataPointStorageD* tdps);
+void decompressDataSeries_double_3D_pwr_pre_log(double** data, size_t r1, size_t r2, size_t r3, TightDataPointStorageD* tdps);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/thirdparty/SZ/sz/include/szd_float.h b/thirdparty/SZ/sz/include/szd_float.h
index 8aaf42c95e5285f46d093fc66db09a3afb4afabf..b2168fe7ffdaff230de50dc0db3c26a5913e8564 100644
--- a/thirdparty/SZ/sz/include/szd_float.h
+++ b/thirdparty/SZ/sz/include/szd_float.h
@@ -31,6 +31,11 @@ size_t decompressDataSeries_float_2D_RA_block(float * data, float mean, size_t d
 int SZ_decompress_args_float(float** newData, size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, unsigned char* cmpBytes, size_t cmpSize);
 
 size_t decompressDataSeries_float_3D_RA_block(float * data, float mean, size_t dim_0, size_t dim_1, size_t dim_2, size_t block_dim_0, size_t block_dim_1, size_t block_dim_2, double realPrecision, int * type, float * unpredictable_data);
+
+void decompressDataSeries_float_2D_nonblocked_with_blocked_regression(float** data, size_t r1, size_t r2, unsigned char* comp_data);
+void decompressDataSeries_float_3D_nonblocked_with_blocked_regression(float** data, size_t r1, size_t r2, size_t r3, unsigned char* comp_data);
+void decompressDataSeries_float_3D_random_access_with_blocked_regression(float** data, size_t r1, size_t r2, size_t r3, unsigned char* comp_data);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/thirdparty/SZ/sz/include/szd_float_pwr.h b/thirdparty/SZ/sz/include/szd_float_pwr.h
index 0907517f732a3a20dd180d69d6f9818a6e64bb07..7f57b4597deebca30a58e05153a4fa5f89cbb5b3 100644
--- a/thirdparty/SZ/sz/include/szd_float_pwr.h
+++ b/thirdparty/SZ/sz/include/szd_float_pwr.h
@@ -22,6 +22,9 @@ void decompressDataSeries_float_3D_pwr(float** data, size_t r1, size_t r2, size_
 
 char* decompressGroupIDArray(unsigned char* bytes, size_t dataLength);
 void decompressDataSeries_float_1D_pwrgroup(float** data, size_t dataSeriesLength, TightDataPointStorageF* tdps);
+void decompressDataSeries_float_1D_pwr_pre_log(float** data, size_t dataSeriesLength, TightDataPointStorageF* tdps);
+void decompressDataSeries_float_2D_pwr_pre_log(float** data, size_t r1, size_t r2, TightDataPointStorageF* tdps);
+void decompressDataSeries_float_3D_pwr_pre_log(float** data, size_t r1, size_t r2, size_t r3, TightDataPointStorageF* tdps);
 
 #ifdef __cplusplus
 }
diff --git a/thirdparty/SZ/sz/include/utility.h b/thirdparty/SZ/sz/include/utility.h
new file mode 100644
index 0000000000000000000000000000000000000000..b66c141871f6f95d2c62584eecc99784a46cebbb
--- /dev/null
+++ b/thirdparty/SZ/sz/include/utility.h
@@ -0,0 +1,43 @@
+/**
+ *  @file utility.h
+ *  @author Sheng Di, Sihuan Li
+ *  @date July, 2018
+ *  @brief Header file for utility.c (sorting/reordering helpers and lossless-compression wrappers).
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef _UTILITY_H
+#define _UTILITY_H
+
+#include "sz.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//sihuan added: helper struct that keeps sorting and swapping easy to implement;
+//its performance should be optimized later.
+typedef struct sort_ast_particle{
+	int64_t id;
+	float var[6];
+} sort_ast_particle;
+
+int compare_struct(const void* obj1, const void* obj2);//sihuan added: comparison callback passed to qsort for two structures
+void reorder_vars(SZ_VarSet* vset);//sihuan added: reorder the variables in increasing order of their index
+//sihuan added: find the intersection and keep the new var sorted by id
+size_t intersectAndsort(int64_t* preIndex, size_t preLen, SZ_VarSet* curVar, size_t dataLen, unsigned char* bitarray);
+//sihuan added: write the reordered input to files for later decompression validation
+void write_reordered_tofile(SZ_VarSet* curVar, size_t dataLen);
+float calculate_delta_t(size_t size);//sihuan added
+
+int is_lossless_compressed_data(unsigned char* compressedBytes, size_t cmpSize);
+unsigned long sz_lossless_compress(int losslessCompressor, int level, unsigned char* data, unsigned long dataLength, unsigned char** compressBytes);
+unsigned long sz_lossless_decompress(int losslessCompressor, unsigned char* compressBytes, unsigned long cmpSize, unsigned char** oriData, unsigned long targetOriSize);
+unsigned long sz_lossless_decompress65536bytes(int losslessCompressor, unsigned char* compressBytes, unsigned long cmpSize, unsigned char** oriData);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _UTILITY_H  ----- */
diff --git a/thirdparty/SZ/sz/src/ByteToolkit.c b/thirdparty/SZ/sz/src/ByteToolkit.c
index 31dbf66f9b1fba6ff623adccf0f47ad4401e5a5c..43fd8112830826f98689e899e141125714770be5 100644
--- a/thirdparty/SZ/sz/src/ByteToolkit.c
+++ b/thirdparty/SZ/sz/src/ByteToolkit.c
@@ -430,7 +430,7 @@ long bytesToLong(unsigned char* bytes)
 }
 
 //the byte to input is in the big-endian format
-float bytesToFloat(unsigned char* bytes)
+inline float bytesToFloat(unsigned char* bytes)
 {
 	lfloat buf;
 	memcpy(buf.byte, bytes, 4);
@@ -439,7 +439,7 @@ float bytesToFloat(unsigned char* bytes)
 	return buf.value;
 }
 
-void floatToBytes(unsigned char *b, float num)
+inline void floatToBytes(unsigned char *b, float num)
 {
 	lfloat buf;
 	buf.value = num;
@@ -449,7 +449,7 @@ void floatToBytes(unsigned char *b, float num)
 }
 
 //the byte to input is in the big-endian format
-double bytesToDouble(unsigned char* bytes)
+inline double bytesToDouble(unsigned char* bytes)
 {
 	ldouble buf;
 	memcpy(buf.byte, bytes, 8);
@@ -458,7 +458,7 @@ double bytesToDouble(unsigned char* bytes)
 	return buf.value;
 }
 
-void doubleToBytes(unsigned char *b, double num)
+inline void doubleToBytes(unsigned char *b, double num)
 {
 	ldouble buf;
 	buf.value = num;
@@ -507,7 +507,7 @@ int extractBytes(unsigned char* byteArray, size_t k, int validLength)
 	return result;
 }
 
-int getMaskRightCode(int m) {
+inline int getMaskRightCode(int m) {
 	switch (m) {
 	case 1:
 		return 0x01;
@@ -530,16 +530,16 @@ int getMaskRightCode(int m) {
 	}
 }
 
-int getLeftMovingCode(int kMod8)
+inline int getLeftMovingCode(int kMod8)
 {
 	return getMaskRightCode(8 - kMod8);
 }
 
-int getRightMovingSteps(int kMod8, int resiBitLength) {
+inline int getRightMovingSteps(int kMod8, int resiBitLength) {
 	return 8 - kMod8 - resiBitLength;
 }
 
-int getRightMovingCode(int kMod8, int resiBitLength)
+inline int getRightMovingCode(int kMod8, int resiBitLength)
 {
 	int rightMovingSteps = 8 - kMod8 - resiBitLength;
 	if(rightMovingSteps < 0)
@@ -814,7 +814,7 @@ void convertULongArrayToBytes(uint64_t* states, size_t stateLength, unsigned cha
 }
 
 
-size_t bytesToSize(unsigned char* bytes)
+inline size_t bytesToSize(unsigned char* bytes)
 {
 	size_t result = 0;
 	if(exe_params->SZ_SIZE_TYPE==4)	
@@ -824,7 +824,7 @@ size_t bytesToSize(unsigned char* bytes)
 	return result;
 }
 
-void sizeToBytes(unsigned char* outBytes, size_t size)
+inline void sizeToBytes(unsigned char* outBytes, size_t size)
 {
 	if(exe_params->SZ_SIZE_TYPE==4)
 		intToBytes_bigEndian(outBytes, size);//4
diff --git a/thirdparty/SZ/sz/src/DynamicDoubleArray.c b/thirdparty/SZ/sz/src/DynamicDoubleArray.c
index 20eb579d65d5462ebf9e80b8f8ee21ecca083883..54bbb109aaa500e6412357f5504e1616e76ed03f 100644
--- a/thirdparty/SZ/sz/src/DynamicDoubleArray.c
+++ b/thirdparty/SZ/sz/src/DynamicDoubleArray.c
@@ -21,7 +21,7 @@ void new_DDA(DynamicDoubleArray **dda, size_t cap) {
 
 void convertDDAtoDoubles(DynamicDoubleArray *dba, double **data)
 {
-	int size = dba->size;
+	size_t size = dba->size;
 	if(size>0)
 		*data = (double*)malloc(size * sizeof(double));
 	else
diff --git a/thirdparty/SZ/sz/src/DynamicFloatArray.c b/thirdparty/SZ/sz/src/DynamicFloatArray.c
index f775827a83610246c841cc0a284b35818ef7b525..1a80a4888f79998b706d318fadd79485a3f19ca4 100644
--- a/thirdparty/SZ/sz/src/DynamicFloatArray.c
+++ b/thirdparty/SZ/sz/src/DynamicFloatArray.c
@@ -21,7 +21,7 @@ void new_DFA(DynamicFloatArray **dfa, size_t cap) {
 
 void convertDFAtoFloats(DynamicFloatArray *dfa, float **data)
 {
-	int size = dfa->size;
+	size_t size = dfa->size;
 	if(size>0)
 		*data = (float*)malloc(size * sizeof(float));
 	else
diff --git a/thirdparty/SZ/sz/src/DynamicIntArray.c b/thirdparty/SZ/sz/src/DynamicIntArray.c
index 3196ab99134e632a855183cd982e31d4004785b8..9b713aad12091a17d8f1d01ea243aa9e3b094941 100644
--- a/thirdparty/SZ/sz/src/DynamicIntArray.c
+++ b/thirdparty/SZ/sz/src/DynamicIntArray.c
@@ -21,7 +21,7 @@ void new_DIA(DynamicIntArray **dia, size_t cap) {
 
 void convertDIAtoInts(DynamicIntArray *dia, unsigned char **data)
 {
-	int size = dia->size;
+	size_t size = dia->size;
 	if(size>0)
 		*data = (unsigned char*)malloc(size * sizeof(char));
 	else
diff --git a/thirdparty/SZ/sz/src/Huffman.c b/thirdparty/SZ/sz/src/Huffman.c
index d067609063c54bb297fbfac0df04280ccb43f4b5..6004090a891a5409fab9b82943bb2639c58b0768 100644
--- a/thirdparty/SZ/sz/src/Huffman.c
+++ b/thirdparty/SZ/sz/src/Huffman.c
@@ -651,23 +651,26 @@ node reconstruct_HuffTree_from_bytes_anyStates(HuffmanTree *huffmanTree, unsigne
 
 void encode_withTree(HuffmanTree* huffmanTree, int *s, size_t length, unsigned char **out, size_t *outSize)
 {
-	size_t i, nodeCount = 0;
+	size_t i; 
+	int nodeCount = 0;
 	unsigned char *treeBytes, buffer[4];
 	
 	init(huffmanTree, s, length);
 	for (i = 0; i < huffmanTree->stateNum; i++)
-		if (huffmanTree->code[i]) nodeCount++;
+		if (huffmanTree->code[i]) nodeCount++; 
 	nodeCount = nodeCount*2-1;
 	unsigned int treeByteSize = convert_HuffTree_to_bytes_anyStates(huffmanTree,nodeCount, &treeBytes);
 	//printf("treeByteSize=%d\n", treeByteSize);
-	*out = (unsigned char*)malloc(length*sizeof(int)+treeByteSize);
+	*out = (unsigned char*)malloc(length*sizeof(int)+treeByteSize+8); //+8: room for the two 4-byte header fields written below
 	intToBytes_bigEndian(buffer, nodeCount);
 	memcpy(*out, buffer, 4);
-	memcpy(*out+4, treeBytes, treeByteSize);
+	intToBytes_bigEndian(buffer, huffmanTree->stateNum/2); //real number of intervals
+	memcpy(*out+4, buffer, 4);
+	memcpy(*out+8, treeBytes, treeByteSize); //serialized tree starts after the 8-byte header
 	free(treeBytes);
 	size_t enCodeSize = 0;
-	encode(huffmanTree, s, length, *out+4+treeByteSize, &enCodeSize);
-	*outSize = 4+treeByteSize+enCodeSize;
+	encode(huffmanTree, s, length, *out+8+treeByteSize, &enCodeSize); //encoded symbols follow the tree
+	*outSize = 8+treeByteSize+enCodeSize; //header + tree + encoded symbols
 	
 	//unsigned short state[length];
 	//decode(*out+4+treeByteSize, enCodeSize, qqq[0], state);
@@ -682,7 +685,7 @@ void decode_withTree(HuffmanTree* huffmanTree, unsigned char *s, size_t targetLe
 {
 	size_t encodeStartIndex;
 	size_t nodeCount = bytesToInt_bigEndian(s);
-	node root = reconstruct_HuffTree_from_bytes_anyStates(huffmanTree,s+4, nodeCount);
+	node root = reconstruct_HuffTree_from_bytes_anyStates(huffmanTree,s+8, nodeCount);
 	
 	//sdi: Debug
 /*	build_code(root, 0, 0, 0);
@@ -701,7 +704,7 @@ void decode_withTree(HuffmanTree* huffmanTree, unsigned char *s, size_t targetLe
 		encodeStartIndex = 1+2*nodeCount*sizeof(unsigned short)+nodeCount*sizeof(unsigned char)+nodeCount*sizeof(unsigned int);
 	else
 		encodeStartIndex = 1+3*nodeCount*sizeof(unsigned int)+nodeCount*sizeof(unsigned char);
-	decode(s+4+encodeStartIndex, targetLength, root, out);
+	decode(s+8+encodeStartIndex, targetLength, root, out);
 }
 
 void SZ_ReleaseHuffman(HuffmanTree* huffmanTree)
diff --git a/thirdparty/SZ/sz/src/TightDataPointStorageD.c b/thirdparty/SZ/sz/src/TightDataPointStorageD.c
index 6ece9dbb56dfaa538d9796b9ab16f998d72bc474..f31bf856d28669cdfa2338761ec55d294bfc5531 100644
--- a/thirdparty/SZ/sz/src/TightDataPointStorageD.c
+++ b/thirdparty/SZ/sz/src/TightDataPointStorageD.c
@@ -46,6 +46,10 @@ void new_TightDataPointStorageD_Empty(TightDataPointStorageD **this)
 	(*this)->segment_size = 0;
 	(*this)->pwrErrBoundBytes = NULL;
 	(*this)->pwrErrBoundBytes_size = 0;
+	
+	(*this)->raBytes = NULL;
+	(*this)->raBytes_size = 0;
+
 }
 
 int new_TightDataPointStorageD_fromFlatBytes(TightDataPointStorageD **this, unsigned char* flatBytes, size_t flatBytesLength)
@@ -83,10 +87,13 @@ int new_TightDataPointStorageD_fromFlatBytes(TightDataPointStorageD **this, unsi
 	sz_params* params = convertBytesToSZParams(&(flatBytes[index]));
 	int mode = confparams_dec->szMode;
 	int predictionMode = confparams_dec->predictionMode;
+	int losslessCompressor = confparams_dec->losslessCompressor; //NOTE(review): confparams_dec is dereferenced here and on the two lines above, before the NULL test below
 	if(confparams_dec!=NULL)
 		free(confparams_dec);
 	confparams_dec = params;
 	confparams_dec->szMode = mode;
+	confparams_dec->losslessCompressor = losslessCompressor; //carry the previously configured value over into the freshly parsed params
+	
 	if(mode==SZ_TEMPORAL_COMPRESSION)
 	{
 		confparams_dec->szMode = SZ_TEMPORAL_COMPRESSION;
@@ -94,6 +101,8 @@ int new_TightDataPointStorageD_fromFlatBytes(TightDataPointStorageD **this, unsi
 	}
 	index += MetaDataByteLength;
 
+	int isRandomAccess = (sameRByte >> 7) & 0x01;
+
 	unsigned char dsLengthBytes[8];
 	for (i = 0; i < exe_params->SZ_SIZE_TYPE; i++)
 		dsLengthBytes[i] = flatBytes[index++];
@@ -121,6 +130,13 @@ int new_TightDataPointStorageD_fromFlatBytes(TightDataPointStorageD **this, unsi
 	else
 		(*this)->allSameData = 0;
 		
+	if(isRandomAccess == 1)
+	{
+		(*this)->raBytes_size = flatBytesLength - 3 - 1 - MetaDataByteLength - exe_params->SZ_SIZE_TYPE;
+		(*this)->raBytes = &(flatBytes[index]);
+		return errorBoundMode;
+	}					
+		
 	int rtype_ = sameRByte & 0x08; //1000		
 
 	unsigned char byteBuf[8];
@@ -204,12 +220,16 @@ int new_TightDataPointStorageD_fromFlatBytes(TightDataPointStorageD **this, unsi
 	{
 		(*this)->leadNumArray_size = (logicLeadNumBitsNum >> 3) + 1;
 	}
+	
+	int minLogValueSize = 0;
+	if(errorBoundMode>=PW_REL)
+		minLogValueSize = 8;
 
 	if ((*this)->rtypeArray != NULL) 
 	{
 		(*this)->residualMidBits_size = flatBytesLength - 3 - 1 - MetaDataByteLength - exe_params->SZ_SIZE_TYPE - 4 - radExpoL - segmentL - pwrErrBoundBytesL - 4 - 8 - 1 - 8 
 				- exe_params->SZ_SIZE_TYPE - exe_params->SZ_SIZE_TYPE - exe_params->SZ_SIZE_TYPE - exe_params->SZ_SIZE_TYPE - 8 - (*this)->rtypeArray_size 
-				- (*this)->typeArray_size - (*this)->leadNumArray_size
+				- minLogValueSize - (*this)->typeArray_size - (*this)->leadNumArray_size
 				- (*this)->exactMidBytes_size - pwrErrBoundBytes_size;
 		for (i = 0; i < (*this)->rtypeArray_size; i++)
 			(*this)->rtypeArray[i] = flatBytes[index++];
@@ -217,10 +237,15 @@ int new_TightDataPointStorageD_fromFlatBytes(TightDataPointStorageD **this, unsi
 	else
 	{
 		(*this)->residualMidBits_size = flatBytesLength - 3 - 1 - MetaDataByteLength - exe_params->SZ_SIZE_TYPE - 4 - radExpoL - segmentL - pwrErrBoundBytesL - 4 - 8 - 1 - 8
-				- exe_params->SZ_SIZE_TYPE - exe_params->SZ_SIZE_TYPE - exe_params->SZ_SIZE_TYPE - (*this)->typeArray_size
+				- exe_params->SZ_SIZE_TYPE - exe_params->SZ_SIZE_TYPE - exe_params->SZ_SIZE_TYPE - minLogValueSize - (*this)->typeArray_size
 				- (*this)->leadNumArray_size - (*this)->exactMidBytes_size - pwrErrBoundBytes_size;
 	}	
 
+	if(errorBoundMode >= PW_REL){
+		(*this)->minLogValue = bytesToDouble(&flatBytes[index]);
+		index+=8;
+	}
+
 	(*this)->typeArray = &flatBytes[index];
 	//retrieve the number of states (i.e., stateNum)
 	(*this)->allNodes = bytesToInt_bigEndian((*this)->typeArray); //the first 4 bytes store the stateNum
@@ -422,6 +447,13 @@ void convertTDPStoBytes_double(TightDataPointStorageD* tdps, unsigned char* byte
 	for(i = 0;i<exe_params->SZ_SIZE_TYPE;i++)//ST
 		bytes[k++] = exactMidBytesLength[i];
 
+	if(confparams_cpr->errorBoundMode>=PW_REL)
+	{
+		doubleToBytes(exactMidBytesLength, tdps->minLogValue);
+		for(i = 0;i < 8; i++)
+			bytes[k++] = exactMidBytesLength[i];
+	}
+
 	memcpy(&(bytes[k]), tdps->typeArray, tdps->typeArray_size);
 	k += tdps->typeArray_size;
 	if(confparams_cpr->errorBoundMode>=PW_REL)
@@ -521,6 +553,14 @@ void convertTDPStoBytes_double_reserve(TightDataPointStorageD* tdps, unsigned ch
 	
 	memcpy(&(bytes[k]), tdps->rtypeArray, tdps->rtypeArray_size);
 	k += tdps->rtypeArray_size;		
+	
+	if(confparams_cpr->errorBoundMode>=PW_REL)
+	{
+		doubleToBytes(exactMidBytesLength, tdps->minLogValue);
+		for(i = 0;i < 8; i++)
+			bytes[k++] = exactMidBytesLength[i];
+	}
+	
 	memcpy(&(bytes[k]), tdps->typeArray, tdps->typeArray_size);
 	k += tdps->typeArray_size;
 	if(confparams_cpr->errorBoundMode>=PW_REL)
@@ -583,15 +623,19 @@ void convertTDPStoFlatBytes_double(TightDataPointStorageD *tdps, unsigned char**
 	{
 		size_t residualMidBitsLength = tdps->residualMidBits == NULL ? 0 : tdps->residualMidBits_size;
 		size_t segmentL = 0, radExpoL = 0, pwrBoundArrayL = 0;
+
+		int minLogValueSize = 0;
 		if(confparams_cpr->errorBoundMode>=PW_REL)
 		{			
 			segmentL = exe_params->SZ_SIZE_TYPE;
 			radExpoL = 1;
 			pwrBoundArrayL = 4;
+			minLogValueSize = 8;
 		}
 
 		size_t totalByteLength = 3 + 1 + MetaDataByteLength + exe_params->SZ_SIZE_TYPE + 4 + radExpoL + segmentL + pwrBoundArrayL + 4 + 8 + 1 + 8 
 				+ exe_params->SZ_SIZE_TYPE + exe_params->SZ_SIZE_TYPE + exe_params->SZ_SIZE_TYPE 
+				+ minLogValueSize /*max absolute log value*/
 				+ tdps->typeArray_size + tdps->leadNumArray_size
 				+ tdps->exactMidBytes_size + residualMidBitsLength + tdps->pwrErrBoundBytes_size;
 
@@ -605,16 +649,18 @@ void convertTDPStoFlatBytes_double(TightDataPointStorageD *tdps, unsigned char**
 	{
 		size_t residualMidBitsLength = tdps->residualMidBits == NULL ? 0 : tdps->residualMidBits_size;
 		size_t segmentL = 0, radExpoL = 0, pwrBoundArrayL = 0;
+		int minLogValueSize = 0;
 		if(confparams_cpr->errorBoundMode>=PW_REL)
 		{
 			segmentL = exe_params->SZ_SIZE_TYPE;
 			radExpoL = 1;
 			pwrBoundArrayL = 4;
+			minLogValueSize = 8;
 		}
 
 		size_t totalByteLength = 3 + 1 + MetaDataByteLength + exe_params->SZ_SIZE_TYPE + 4 + radExpoL + segmentL + pwrBoundArrayL + 4 + 8 + 1 + 8 
 				+ exe_params->SZ_SIZE_TYPE + exe_params->SZ_SIZE_TYPE + exe_params->SZ_SIZE_TYPE + exe_params->SZ_SIZE_TYPE + 8 + tdps->rtypeArray_size
-				+ tdps->typeArray_size + tdps->leadNumArray_size 
+				+ minLogValueSize + tdps->typeArray_size + tdps->leadNumArray_size 
 				+ tdps->exactMidBytes_size + residualMidBitsLength + tdps->pwrErrBoundBytes_size;
 
 		sameByte = (unsigned char) (sameByte | 0x08); // 00001000, the 4th bit
diff --git a/thirdparty/SZ/sz/src/TightDataPointStorageF.c b/thirdparty/SZ/sz/src/TightDataPointStorageF.c
index 23a69b8a81c67e52717d1e834461edbe916df7d2..a30f8d93ef8a565c0602458e0de10887fbcce8e4 100644
--- a/thirdparty/SZ/sz/src/TightDataPointStorageF.c
+++ b/thirdparty/SZ/sz/src/TightDataPointStorageF.c
@@ -46,6 +46,9 @@ void new_TightDataPointStorageF_Empty(TightDataPointStorageF **this)
 	(*this)->segment_size = 0;
 	(*this)->pwrErrBoundBytes = NULL;
 	(*this)->pwrErrBoundBytes_size = 0;	
+	
+	(*this)->raBytes = NULL;
+	(*this)->raBytes_size = 0;
 }
 
 int new_TightDataPointStorageF_fromFlatBytes(TightDataPointStorageF **this, unsigned char* flatBytes, size_t flatBytesLength)
@@ -81,10 +84,13 @@ int new_TightDataPointStorageF_fromFlatBytes(TightDataPointStorageF **this, unsi
 	sz_params* params = convertBytesToSZParams(&(flatBytes[index]));
 	int mode = confparams_dec->szMode;
 	int predictionMode = confparams_dec->predictionMode;
+	int losslessCompressor = confparams_dec->losslessCompressor; //NOTE(review): confparams_dec is dereferenced here and on the two lines above, before the NULL test below
 	if(confparams_dec!=NULL)
 		free(confparams_dec);
 	confparams_dec = params;
 	confparams_dec->szMode = mode;
+	confparams_dec->losslessCompressor = losslessCompressor; //carry the previously configured value over into the freshly parsed params
+
 	if(mode==SZ_TEMPORAL_COMPRESSION)
 	{
 		confparams_dec->szMode = SZ_TEMPORAL_COMPRESSION;
@@ -92,6 +98,8 @@ int new_TightDataPointStorageF_fromFlatBytes(TightDataPointStorageF **this, unsi
 	}
 	
 	index += MetaDataByteLength;
+
+	int isRandomAccess = (sameRByte >> 7) & 0x01;
 	
 	unsigned char dsLengthBytes[8];
 	for (i = 0; i < exe_params->SZ_SIZE_TYPE; i++)
@@ -117,6 +125,12 @@ int new_TightDataPointStorageF_fromFlatBytes(TightDataPointStorageF **this, unsi
 	}
 	else
 		(*this)->allSameData = 0;
+	if(isRandomAccess == 1)
+	{
+		(*this)->raBytes_size = flatBytesLength - 3 - 1 - MetaDataByteLength - exe_params->SZ_SIZE_TYPE;
+		(*this)->raBytes = &(flatBytes[index]);
+		return errorBoundMode;
+	}			
 
 	int rtype_ = sameRByte & 0x08;		//=00001000
 	unsigned char byteBuf[8];
@@ -165,7 +179,7 @@ int new_TightDataPointStorageF_fromFlatBytes(TightDataPointStorageF **this, unsi
 	{
 		for(i = 0;i<exe_params->SZ_SIZE_TYPE;i++) 
 			byteBuf[i] = flatBytes[index++];
-		(*this)->rtypeArray_size = bytesToSize(byteBuf);//(ST)		
+		(*this)->rtypeArray_size = bytesToSize(byteBuf);//(ST)
 	}
 	else
 		(*this)->rtypeArray_size = 0;
@@ -199,11 +213,15 @@ int new_TightDataPointStorageF_fromFlatBytes(TightDataPointStorageF **this, unsi
 		(*this)->leadNumArray_size = (logicLeadNumBitsNum >> 3) + 1;
 	}
 
+	int minLogValueSize = 0; //bytes taken by the serialized minLogValue: present (4 bytes) only in PW_REL modes
+	if(errorBoundMode>=PW_REL)
+		minLogValueSize = 4;
+
 	if ((*this)->rtypeArray != NULL) 
 	{
 		(*this)->residualMidBits_size = flatBytesLength - 3 - 1 - MetaDataByteLength - exe_params->SZ_SIZE_TYPE - 4 - radExpoL - segmentL - pwrErrBoundBytesL - 4 - 4 - 1 - 8 
 				- exe_params->SZ_SIZE_TYPE - exe_params->SZ_SIZE_TYPE - exe_params->SZ_SIZE_TYPE - exe_params->SZ_SIZE_TYPE - 4 - (*this)->rtypeArray_size
-				- (*this)->typeArray_size - (*this)->leadNumArray_size
+				- minLogValueSize - (*this)->typeArray_size - (*this)->leadNumArray_size //minLogValueSize is subtracted exactly once, as in the writer's totalByteLength
 				- (*this)->exactMidBytes_size - pwrErrBoundBytes_size;
 		for (i = 0; i < (*this)->rtypeArray_size; i++)
 			(*this)->rtypeArray[i] = flatBytes[index++];
@@ -211,9 +229,15 @@ int new_TightDataPointStorageF_fromFlatBytes(TightDataPointStorageF **this, unsi
 	else
 	{
 		(*this)->residualMidBits_size = flatBytesLength - 3 - 1 - MetaDataByteLength - exe_params->SZ_SIZE_TYPE - 4 - radExpoL - segmentL - pwrErrBoundBytesL - 4 - 4 - 1 - 8 
-				- exe_params->SZ_SIZE_TYPE - exe_params->SZ_SIZE_TYPE - exe_params->SZ_SIZE_TYPE - (*this)->typeArray_size
+				- exe_params->SZ_SIZE_TYPE - exe_params->SZ_SIZE_TYPE - exe_params->SZ_SIZE_TYPE - minLogValueSize - (*this)->typeArray_size
 				- (*this)->leadNumArray_size - (*this)->exactMidBytes_size - pwrErrBoundBytes_size;
-	}	
+	}
+
+	if(errorBoundMode>=PW_REL)
+	{
+		(*this)->minLogValue = bytesToFloat(&flatBytes[index]);
+		index+=4;
+	}
 
 	(*this)->typeArray = &flatBytes[index]; 
 	//retrieve the number of states (i.e., stateNum)
@@ -418,6 +442,13 @@ void convertTDPStoBytes_float(TightDataPointStorageF* tdps, unsigned char* bytes
 	for(i = 0;i<exe_params->SZ_SIZE_TYPE;i++)//ST
 		bytes[k++] = exactMidBytesLength[i];
 
+	if(confparams_cpr->errorBoundMode>=PW_REL)
+	{
+		floatToBytes(exactMidBytesLength, tdps->minLogValue);
+		for(i=0;i<4;i++)
+			bytes[k++] = exactMidBytesLength[i];
+	}
+
 	memcpy(&(bytes[k]), tdps->typeArray, tdps->typeArray_size);
 	k += tdps->typeArray_size;
 	if(confparams_cpr->errorBoundMode>=PW_REL)
@@ -519,6 +550,14 @@ void convertTDPStoBytes_float_reserve(TightDataPointStorageF* tdps, unsigned cha
 
 	memcpy(&(bytes[k]), tdps->rtypeArray, tdps->rtypeArray_size);
 	k += tdps->rtypeArray_size;
+	
+	if(confparams_cpr->errorBoundMode>=PW_REL)
+	{
+		floatToBytes(exactMidBytesLength, tdps->minLogValue);
+		for(i=0;i<4;i++)
+			bytes[k++] = exactMidBytesLength[i];
+	}	
+	
 	memcpy(&(bytes[k]), tdps->typeArray, tdps->typeArray_size);
 	k += tdps->typeArray_size;
 	if(confparams_cpr->errorBoundMode>=PW_REL)
@@ -581,15 +620,17 @@ void convertTDPStoFlatBytes_float(TightDataPointStorageF *tdps, unsigned char**
 	{
 		size_t residualMidBitsLength = tdps->residualMidBits == NULL ? 0 : tdps->residualMidBits_size;
 		size_t segmentL = 0, radExpoL = 0, pwrBoundArrayL = 0;
+		int minLogValueSize = 0;
 		if(confparams_cpr->errorBoundMode>=PW_REL)
 		{			
 			segmentL = exe_params->SZ_SIZE_TYPE;
 			radExpoL = 1;
 			pwrBoundArrayL = 4;
+			minLogValueSize = 4;
 		}
 
 		size_t totalByteLength = 3 + 1 + MetaDataByteLength + exe_params->SZ_SIZE_TYPE + 4 + radExpoL + segmentL + pwrBoundArrayL + 4 + 4 + 1 + 8 
-				+ exe_params->SZ_SIZE_TYPE + exe_params->SZ_SIZE_TYPE + exe_params->SZ_SIZE_TYPE  
+				+ exe_params->SZ_SIZE_TYPE + exe_params->SZ_SIZE_TYPE + exe_params->SZ_SIZE_TYPE + minLogValueSize
 				+ tdps->typeArray_size + tdps->leadNumArray_size 
 				+ tdps->exactMidBytes_size + residualMidBitsLength + tdps->pwrErrBoundBytes_size;
 
@@ -603,16 +644,18 @@ void convertTDPStoFlatBytes_float(TightDataPointStorageF *tdps, unsigned char**
 	{
 		size_t residualMidBitsLength = tdps->residualMidBits == NULL ? 0 : tdps->residualMidBits_size;		
 		size_t segmentL = 0, radExpoL = 0, pwrBoundArrayL = 0;
+		int minLogValueSize = 0;
 		if(confparams_cpr->errorBoundMode>=PW_REL)
 		{
 			segmentL = exe_params->SZ_SIZE_TYPE;
 			radExpoL = 1;
 			pwrBoundArrayL = 4;
+			minLogValueSize = 4;
 		}
 
 		size_t totalByteLength = 3 + 1 + MetaDataByteLength + exe_params->SZ_SIZE_TYPE + 4 + radExpoL + segmentL + pwrBoundArrayL + 4 + 4 + 1 + 8 
 				+ exe_params->SZ_SIZE_TYPE + exe_params->SZ_SIZE_TYPE + exe_params->SZ_SIZE_TYPE + exe_params->SZ_SIZE_TYPE + 4 + tdps->rtypeArray_size
-				+ tdps->typeArray_size + tdps->leadNumArray_size
+				+ minLogValueSize + tdps->typeArray_size + tdps->leadNumArray_size
 				+ tdps->exactMidBytes_size + residualMidBitsLength + tdps->pwrErrBoundBytes_size;
 
 		sameByte = (unsigned char) (sameByte | 0x08); // 00001000, the 4th bit
@@ -721,7 +764,7 @@ void convertTDPStoFlatBytes_float_args(TightDataPointStorageF *tdps, unsigned ch
  * to free the memory used in the compression
  * */
 void free_TightDataPointStorageF(TightDataPointStorageF *tdps)
-{			
+{
 	if(tdps->rtypeArray!=NULL)
 		free(tdps->rtypeArray);
 	if(tdps->typeArray!=NULL)
diff --git a/thirdparty/SZ/sz/src/TypeManager.c b/thirdparty/SZ/sz/src/TypeManager.c
index 42474fb00e6f67de9a55a769aea565ebe73e0682..638f3cbcee4ff3d792701c0cfd79c364af10573c 100644
--- a/thirdparty/SZ/sz/src/TypeManager.c
+++ b/thirdparty/SZ/sz/src/TypeManager.c
@@ -43,7 +43,33 @@ size_t convertIntArray2ByteArray_fast_1b(unsigned char* intArray, size_t intArra
 	}
 	return byteLength;
 }
-	
+
+/* Packs a sequence of 0/1 flags into bytes, eight flags per byte with the first flag in
+ * the most significant bit, writing into the caller-supplied result buffer.
+ * Returns the number of bytes produced: ceil(intArrayLength / 8). */
+size_t convertIntArray2ByteArray_fast_1b_to_result(unsigned char* intArray, size_t intArrayLength, unsigned char *result)
+{
+	size_t numBytes = intArrayLength/8 + (intArrayLength%8 != 0 ? 1 : 0);
+	size_t pos = 0; /* index of the next flag to consume */
+	size_t b;
+
+	for(b = 0; b < numBytes; b++)
+	{
+		int packed = 0;
+		int shift;
+		/* walk from bit 7 down to bit 0; a short final group leaves its low bits 0 */
+		for(shift = 7; shift >= 0 && pos < intArrayLength; shift--)
+		{
+			/* only the exact value 1 sets a bit; any other value is stored as 0 */
+			if(intArray[pos] == 1)
+				packed |= (1 << shift);
+			pos++;
+		}
+		result[b] = (unsigned char)packed;
+	}
+	return numBytes;
+}
+
 void convertByteArray2IntArray_fast_1b(size_t intArrayLength, unsigned char* byteArray, size_t byteArrayLength, unsigned char **intArray)	
 {
     if(intArrayLength > byteArrayLength*8)
@@ -148,6 +174,46 @@ size_t convertIntArray2ByteArray_fast_2b(unsigned char* timeStepType, size_t tim
 	return byteLength;
 }
 
+/*
+ * Packs 2-bit codes (values 0..3) four per byte, first code in the two most
+ * significant bits, into the caller-supplied result buffer.
+ * Returns the number of bytes written: ceil(timeStepTypeLength / 4).
+ * Note: despite the "_inplace" suffix, output goes to result, not to the input.
+ * An entry above 3 prints an error and terminates the process via exit(0).
+ */
+size_t convertIntArray2ByteArray_fast_2b_inplace(unsigned char* timeStepType, size_t timeStepTypeLength, unsigned char *result)
+{
+	size_t outIdx, slot, srcIdx = 0;
+	size_t byteLength = timeStepTypeLength*2/8;
+
+	/* One extra byte holds the leftover codes when the count is not a multiple of 4. */
+	if(timeStepTypeLength%4 != 0)
+		byteLength++;
+
+	for(outIdx = 0; outIdx < byteLength; outIdx++)
+	{
+		int packed = 0;
+
+		for(slot = 0; slot < 4 && srcIdx < timeStepTypeLength; slot++, srcIdx++)
+		{
+			int code = timeStepType[srcIdx];
+
+			if(code > 3)
+			{
+				/* Anything outside 0..3 cannot be represented in two bits. */
+				printf("Error: wrong timestep type...: type[%zu]=%d\n", srcIdx, code);
+				exit(0);
+			}
+			/* Slot 0 lands in bits 7..6, slot 3 in bits 1..0; code 0 sets nothing. */
+			packed |= code << (6 - slot*2);
+		}
+
+		/* A trailing partial byte keeps its unused low bits at zero. */
+		result[outIdx] = (unsigned char)packed;
+	}
+	return byteLength;
+}
+
 void convertByteArray2IntArray_fast_2b(size_t stepLength, unsigned char* byteArray, size_t byteArrayLength, unsigned char **intArray)
 {
 	if(stepLength > byteArrayLength*4)
@@ -291,7 +357,7 @@ void convertByteArray2IntArray_fast_3b(size_t stepLength, unsigned char* byteArr
 	}
 }
 
-int getLeftMovingSteps(size_t k, unsigned char resiBitLength)
+inline int getLeftMovingSteps(size_t k, unsigned char resiBitLength) /* NOTE(review): plain `inline` on a non-static function - confirm a non-inline declaration of it is visible in this translation unit, otherwise C99/C11 emit no external definition here */
 { /* Returns 8 minus (k modulo 8) minus resiBitLength. */
 	return 8 - k%8 - resiBitLength; /* computed in size_t because k is size_t; a negative result relies on the conversion to the int return type */
 }
diff --git a/thirdparty/SZ/sz/src/callZlib.c b/thirdparty/SZ/sz/src/callZlib.c
index 0e392b791564b619fa96d60e678b8a6b1888d225..4e4bb6f2729e401d7779bd62c5ee5601775992fc 100644
--- a/thirdparty/SZ/sz/src/callZlib.c
+++ b/thirdparty/SZ/sz/src/callZlib.c
@@ -27,6 +27,25 @@
     } \
 }
 
+/*
+ * Returns 1 when the two leading bytes match one of the zlib stream headers
+ * this library recognizes, 0 otherwise.
+ * In zlib's header layout (RFC 1950) magic1 is the CMF byte and magic2 the FLG byte.
+ */
+int isZlibFormat(unsigned char magic1, unsigned char magic2)
+{
+	switch(magic1)
+	{
+	case 104: /* 0x68 */
+		/* accepted second bytes: 0x05, 0x81, 0xDE */
+		return (magic2==5 || magic2==129 || magic2==222) ? 1 : 0;
+	case 120: /* 0x78 */
+		/* accepted second bytes: 0x01, 0x5E, 0x9C, 0xDA */
+		return (magic2==1 || magic2==94 || magic2==156 || magic2==218) ? 1 : 0;
+	}
+	return 0;
+}
+
 /*zlib_compress() is only valid for median-size data compression. */
 unsigned long zlib_compress(unsigned char* data, unsigned long dataLength, unsigned char** compressBytes, int level)
 {	
@@ -195,6 +214,9 @@ unsigned long zlib_compress5(unsigned char* data, unsigned long dataLength, unsi
 	strm.zfree = Z_NULL;
 	strm.opaque = Z_NULL;
 	ret = deflateInit(&strm, level);
+	//int windowBits = 15;
+    //ret = deflateInit2(&strm, level, Z_DEFLATED, windowBits, DEF_MEM_LEVEL, Z_DEFAULT_STRATEGY);//Z_FIXED); //Z_DEFAULT_STRATEGY
+
 	if (ret != Z_OK)
 		return ret;
 
diff --git a/thirdparty/SZ/sz/src/conf.c b/thirdparty/SZ/sz/src/conf.c
index 8e6959d70f68bff12fae174da79c3cdc5df65638..cc5ccf38fc36078287b5ad61bc4fcf17a6f83940 100644
--- a/thirdparty/SZ/sz/src/conf.c
+++ b/thirdparty/SZ/sz/src/conf.c
@@ -102,11 +102,16 @@ int SZ_ReadConf(const char* sz_cfgFile) {
 		confparams_cpr->sampleDistance = 100;
 		
 		confparams_cpr->szMode = SZ_BEST_COMPRESSION;
-		
-		confparams_cpr->gzipMode = 1; //fast mode
+		confparams_cpr->losslessCompressor = ZSTD_COMPRESSOR; //other option: GZIP_COMPRESSOR;
+		if(confparams_cpr->losslessCompressor==ZSTD_COMPRESSOR)
+			confparams_cpr->gzipMode = 3; //fast mode
+		else
+			confparams_cpr->gzipMode = 1; //high speed mode
 		
 		confparams_cpr->errorBoundMode = PSNR;
 		confparams_cpr->psnr = 90;
+		confparams_cpr->absErrBound = 1E-4;
+		confparams_cpr->relBoundRatio = 1E-4;
 		
 		confparams_cpr->pw_relBoundRatio = 1E-3;
 		confparams_cpr->segment_size = 36;
@@ -114,6 +119,8 @@ int SZ_ReadConf(const char* sz_cfgFile) {
 		confparams_cpr->pwr_type = SZ_PWR_MIN_TYPE;
 		
 		confparams_cpr->snapshotCmprStep = 5;
+		
+		sz_with_regression = SZ_WITH_LINEAR_REGRESSION;
 	
 		return SZ_SCES;
 	}
@@ -212,7 +219,26 @@ int SZ_ReadConf(const char* sz_cfgFile) {
 			return SZ_NSCS;	
 		}
 		
-		modeBuf = iniparser_getstring(ini, "PARAMETER:gzipMode", NULL);
+		modeBuf = iniparser_getstring(ini, "PARAMETER:losslessCompressor", "ZSTD_COMPRESSOR"); //third argument is the fallback value passed to iniparser
+		if(strcmp(modeBuf, "GZIP_COMPRESSOR")==0)
+			confparams_cpr->losslessCompressor = GZIP_COMPRESSOR;
+		else if(strcmp(modeBuf, "ZSTD_COMPRESSOR")==0)
+			confparams_cpr->losslessCompressor = ZSTD_COMPRESSOR;
+		else //unknown name: report it, release the parsed dictionary and fail
+		{
+			printf("[SZ] Error: Wrong losslessCompressor setting (please check sz.config file)\n");
+			printf("No Such a lossless compressor: %s\n", modeBuf);
+			iniparser_freedict(ini);
+			return SZ_NSCS;	
+		}		
+		
+		modeBuf = iniparser_getstring(ini, "PARAMETER:withLinearRegression", "YES");
+		if(strcmp(modeBuf, "YES")==0 || strcmp(modeBuf, "yes")==0)
+			sz_with_regression = SZ_WITH_LINEAR_REGRESSION;
+		else
+			sz_with_regression = SZ_NO_REGRESSION;
+		
+		modeBuf = iniparser_getstring(ini, "PARAMETER:gzipMode", "Gzip_BEST_SPEED");
 		if(modeBuf==NULL)
 		{
 			printf("[SZ] Error: Null Gzip mode setting (please check sz.config file)\n");
@@ -233,6 +259,29 @@ int SZ_ReadConf(const char* sz_cfgFile) {
 			return SZ_NSCS;
 		}
 		
+		modeBuf = iniparser_getstring(ini, "PARAMETER:zstdMode", "Zstd_HIGH_SPEED"); //NOTE(review): the level below is written to gzipMode whatever losslessCompressor was selected - confirm this is intended
+		if(modeBuf==NULL)
+		{
+			printf("[SZ] Error: Null Zstd mode setting (please check sz.config file)\n");
+			iniparser_freedict(ini);
+			return SZ_NSCS;					
+		}		
+		else if(strcmp(modeBuf, "Zstd_BEST_SPEED")==0)
+			confparams_cpr->gzipMode = 1;
+		else if(strcmp(modeBuf, "Zstd_HIGH_SPEED")==0)
+			confparams_cpr->gzipMode = 3;
+		else if(strcmp(modeBuf, "Zstd_HIGH_COMPRESSION")==0)
+			confparams_cpr->gzipMode = 19;
+		else if(strcmp(modeBuf, "Zstd_BEST_COMPRESSION")==0)
+			confparams_cpr->gzipMode = 22;			
+		else if(strcmp(modeBuf, "Zstd_DEFAULT_COMPRESSION")==0)
+			confparams_cpr->gzipMode = 3;
+		else
+		{
+			printf("[SZ] Error: Wrong zstd Mode (please check sz.config file)\n");
+			iniparser_freedict(ini); //release the dictionary here too, as the other error exits do
+			return SZ_NSCS;
+		}		
 		//TODO
 		confparams_cpr->snapshotCmprStep = (int)iniparser_getint(ini, "PARAMETER:snapshotCmprStep", 5);
 				
diff --git a/thirdparty/SZ/sz/src/dataCompression.c b/thirdparty/SZ/sz/src/dataCompression.c
index 0bb5ce1a6140d7bb51a12d21491ceee61ec24287..212a104a1079e0efa77cfe6d2af1a54503fa72e6 100644
--- a/thirdparty/SZ/sz/src/dataCompression.c
+++ b/thirdparty/SZ/sz/src/dataCompression.c
@@ -66,14 +66,14 @@ long computeRangeSize_int(void* oriData, int dataType, size_t size, int64_t* val
 	else if(dataType == SZ_UINT32)
 	{
 		unsigned int* data = (unsigned int*)oriData;
-		int data_; 
+		unsigned int data_; 
 		min = data[0], max = min;
 		computeMinMax(data);
 	}
 	else if(dataType == SZ_INT32)
 	{
 		int* data = (int*)oriData;
-		unsigned int data_; 
+		int data_; 
 		min = data[0], max = min;
 		computeMinMax(data);
 	}
@@ -595,3 +595,277 @@ int initRandomAccessBytes(unsigned char* raBytes)
 
         return k;
 }
+
+//The following functions are the single-precision (float) versions of the routines that handle unpredictable data points
+/*
+ * Previews the effect of bit truncation on oriData without producing any encoded
+ * stream: every value has *medianValue subtracted, the low-order bits of its
+ * integer view (ivalue) beyond reqLength are cleared, and *medianValue is added
+ * back before the result is stored in decData.
+ * Writes reqLength/8 and reqLength%8 to the two out-parameters; returns reqLength.
+ */
+int generateLossyCoefficients_float(float* oriData, double precision, size_t nbEle, int* reqBytesLength, int* resiBitsLength, float* medianValue, float* decData)
+{
+	float rangeSize;
+	int reqLength;
+	size_t idx;
+
+	/* Helpers defined elsewhere receive pointers to rangeSize, medianValue and reqLength. */
+	computeRangeSize_float(oriData, nbEle, &rangeSize, medianValue);
+	short radExpo = getExponent_float(rangeSize/2);
+	computeReqLength_float(precision, radExpo, &reqLength, medianValue);
+
+	*reqBytesLength = reqLength/8;
+	*resiBitsLength = reqLength%8;
+
+	/* Number of low-order bits to clear; loop-invariant and clamped at zero. */
+	int droppedBits = (32 - reqLength < 0) ? 0 : (32 - reqLength);
+
+	for(idx = 0; idx < nbEle; idx++)
+	{
+		lfloat bits;
+		bits.value = oriData[idx] - *medianValue;
+		bits.ivalue = (bits.ivalue >> droppedBits) << droppedBits;
+		decData[idx] = bits.value + *medianValue;
+	}
+	return reqLength;
+}
+		
+/**
+ * Encodes every value of oriData with the exact-data scheme and overwrites each
+ * element in place with the value the encoder reports back (vce.data).
+ * @param oriData in/out array of nbEle values
+ * @param leadArray, midArray, resiArray outputs filled by convertDIAtoInts / convertDBAtoBytes
+ * @return size of the exact-mid-byte array (a size_t narrowed to int)
+ * */
+int compressExactDataArray_float(float* oriData, double precision, size_t nbEle, unsigned char** leadArray, unsigned char** midArray, unsigned char** resiArray, 
+int reqLength, int reqBytesLength, int resiBitsLength, float medianValue)
+{
+	//growable scratch arrays backing the three output streams
+	DynamicIntArray *exactLeadNumArray;
+	new_DIA(&exactLeadNumArray, DynArrayInitLen);	
+	DynamicByteArray *exactMidByteArray;
+	new_DBA(&exactMidByteArray, DynArrayInitLen);
+	DynamicIntArray *resiBitArray;
+	new_DIA(&resiBitArray, DynArrayInitLen);
+	unsigned char preDataBytes[4] = {0,0,0,0};	
+
+	//per-value work elements live on the stack: no unchecked malloc, nothing to free
+	FloatValueCompressElement vce;
+	LossyCompressionElement lce;
+
+	size_t i = 0;
+	for(i = 0;i < nbEle;i++)
+	{
+		compressSingleFloatValue(&vce, oriData[i], precision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+		updateLossyCompElement_Float(vce.curBytes, preDataBytes, reqBytesLength, resiBitsLength, &lce);
+		memcpy(preDataBytes,vce.curBytes,4); //these bytes become preDataBytes for the next iteration
+		addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, &lce);
+		oriData[i] = vce.data;
+	}
+	convertDIAtoInts(exactLeadNumArray, leadArray);
+	convertDBAtoBytes(exactMidByteArray,midArray);
+	convertDIAtoInts(resiBitArray, resiArray);
+
+	size_t midArraySize = exactMidByteArray->size; //read before the array is released below
+	
+	free_DIA(exactLeadNumArray);
+	free_DBA(exactMidByteArray);
+	free_DIA(resiBitArray);
+
+	return midArraySize;
+}
+
+/* Rebuilds nbEle floats from the leadNum / exactMidBytes / residualMidBits streams into a buffer allocated here and returned through *decData. */
+void decompressExactDataArray_float(unsigned char* leadNum, unsigned char* exactMidBytes, unsigned char* residualMidBits, size_t nbEle, int reqLength, float medianValue, float** decData)
+{
+	*decData = (float*)malloc(nbEle*sizeof(float)); // NOTE(review): malloc result is used unchecked; presumably the caller frees it - confirm
+	size_t i = 0, j = 0, k = 0, l = 0, p = 0, curByteIndex = 0; // k: bit offset and p: byte index into residualMidBits; l: index into leadNum; curByteIndex: index into exactMidBytes
+	float exactData = 0;
+	unsigned char preBytes[4] = {0,0,0,0}; // bytes of the previously decoded value (zeros before the first)
+	unsigned char curBytes[4];
+	int resiBits; 
+	unsigned char leadingNum;		
+	
+	int reqBytesLength = reqLength/8; // whole bytes kept per value
+	int resiBitsLength = reqLength%8; // extra bits kept per value beyond the whole bytes
+	
+	for(i = 0; i<nbEle;i++)
+	{
+		// compute resiBits: pull resiBitsLength bits for this value out of residualMidBits
+		resiBits = 0;
+		if (resiBitsLength != 0) {
+			int kMod8 = k % 8;
+			int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+			if (rightMovSteps > 0) { // masked bits are shifted down; byte p is not finished yet
+				int code = getRightMovingCode(kMod8, resiBitsLength);
+				resiBits = (residualMidBits[p] & code) >> rightMovSteps;
+			} else if (rightMovSteps < 0) { // bits are split across byte p and byte p+1
+				int code1 = getLeftMovingCode(kMod8);
+				int code2 = getRightMovingCode(kMod8, resiBitsLength);
+				int leftMovSteps = -rightMovSteps;
+				rightMovSteps = 8 - leftMovSteps;
+				resiBits = (residualMidBits[p] & code1) << leftMovSteps;
+				p++;
+				resiBits = resiBits
+						| ((residualMidBits[p] & code2) >> rightMovSteps);
+			} else // rightMovSteps == 0: masked bits need no shift and byte p is finished
+			{
+				int code = getRightMovingCode(kMod8, resiBitsLength);
+				resiBits = (residualMidBits[p] & code);
+				p++;
+			}
+			k += resiBitsLength;
+		}
+		// recover the exact data: the first leadingNum bytes are copied from the previous value, the rest come from exactMidBytes
+		memset(curBytes, 0, 4);
+		leadingNum = leadNum[l++];
+		memcpy(curBytes, preBytes, leadingNum);
+		for (j = leadingNum; j < reqBytesLength; j++)
+			curBytes[j] = exactMidBytes[curByteIndex++];
+		if (resiBitsLength != 0) {
+			unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength)); // residual bits go to the top of the byte after the whole bytes
+			curBytes[reqBytesLength] = resiByte;
+		}
+
+		exactData = bytesToFloat(curBytes);
+		(*decData)[i] = exactData + medianValue; // medianValue is added to every decoded value
+		memcpy(preBytes,curBytes,4);
+	}	
+}
+
+//Double-precision versions of the routines that handle unpredictable data points (SZ 2.0)
+/*
+ * Previews bit truncation on oriData without encoding: each value has *medianValue
+ * subtracted, the low-order bits of its integer view (lvalue) beyond reqLength cleared,
+ * and *medianValue added back into decData. Outputs reqLength/8, reqLength%8; returns reqLength.
+ */
+int generateLossyCoefficients_double(double* oriData, double precision, size_t nbEle, int* reqBytesLength, int* resiBitsLength, double* medianValue, double* decData)
+{
+	double rangeSize;
+	int reqLength;
+	size_t idx;
+
+	/* Helpers defined elsewhere receive pointers to rangeSize, medianValue and reqLength. */
+	computeRangeSize_double(oriData, nbEle, &rangeSize, medianValue);
+	short radExpo = getExponent_double(rangeSize/2);
+	computeReqLength_double(precision, radExpo, &reqLength, medianValue);
+
+	*reqBytesLength = reqLength/8;
+	*resiBitsLength = reqLength%8;
+
+	/* Number of low-order bits to clear; loop-invariant and clamped at zero. */
+	int droppedBits = (64 - reqLength < 0) ? 0 : (64 - reqLength);
+
+	for(idx = 0; idx < nbEle; idx++)
+	{
+		ldouble bits;
+		bits.value = oriData[idx] - *medianValue;
+		bits.lvalue = (bits.lvalue >> droppedBits) << droppedBits;
+		decData[idx] = bits.value + *medianValue;
+	}
+	return reqLength;
+}	
+		
+/**
+ * Encodes every value of oriData with the exact-data scheme and overwrites each
+ * element in place with the value the encoder reports back (vce.data).
+ * @param oriData in/out array of nbEle values
+ * @param leadArray, midArray, resiArray outputs filled by convertDIAtoInts / convertDBAtoBytes
+ * @return size of the exact-mid-byte array (a size_t narrowed to int)
+ * */
+int compressExactDataArray_double(double* oriData, double precision, size_t nbEle, unsigned char** leadArray, unsigned char** midArray, unsigned char** resiArray, 
+int reqLength, int reqBytesLength, int resiBitsLength, double medianValue)
+{
+	//growable scratch arrays backing the three output streams
+	DynamicIntArray *exactLeadNumArray;
+	new_DIA(&exactLeadNumArray, DynArrayInitLen);	
+	DynamicByteArray *exactMidByteArray;
+	new_DBA(&exactMidByteArray, DynArrayInitLen);
+	DynamicIntArray *resiBitArray;
+	new_DIA(&resiBitArray, DynArrayInitLen);
+	unsigned char preDataBytes[8] = {0,0,0,0,0,0,0,0};	
+
+	//per-value work elements live on the stack: no unchecked malloc, nothing to free
+	DoubleValueCompressElement vce;
+	LossyCompressionElement lce;
+
+	size_t i = 0;
+	for(i = 0;i < nbEle;i++)
+	{
+		compressSingleDoubleValue(&vce, oriData[i], precision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+		updateLossyCompElement_Double(vce.curBytes, preDataBytes, reqBytesLength, resiBitsLength, &lce);
+		memcpy(preDataBytes,vce.curBytes,8); //these bytes become preDataBytes for the next iteration
+		addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, &lce);
+		oriData[i] = vce.data;
+	}
+	convertDIAtoInts(exactLeadNumArray, leadArray);
+	convertDBAtoBytes(exactMidByteArray,midArray);
+	convertDIAtoInts(resiBitArray, resiArray);
+
+	size_t midArraySize = exactMidByteArray->size; //read before the array is released below
+	
+	free_DIA(exactLeadNumArray);
+	free_DBA(exactMidByteArray);
+	free_DIA(resiBitArray);
+
+	return midArraySize;
+}
+
+/* Rebuilds nbEle doubles from the leadNum / exactMidBytes / residualMidBits streams into a buffer allocated here and returned through *decData. */
+void decompressExactDataArray_double(unsigned char* leadNum, unsigned char* exactMidBytes, unsigned char* residualMidBits, size_t nbEle, int reqLength, double medianValue, double** decData)
+{
+	*decData = (double*)malloc(nbEle*sizeof(double)); // NOTE(review): malloc result is used unchecked; presumably the caller frees it - confirm
+	size_t i = 0, j = 0, k = 0, l = 0, p = 0, curByteIndex = 0; // k: bit offset and p: byte index into residualMidBits; l: index into leadNum; curByteIndex: index into exactMidBytes
+	double exactData = 0;
+	unsigned char preBytes[8] = {0,0,0,0,0,0,0,0}; // bytes of the previously decoded value (zeros before the first)
+	unsigned char curBytes[8];
+	int resiBits; 
+	unsigned char leadingNum;		
+	
+	int reqBytesLength = reqLength/8; // whole bytes kept per value
+	int resiBitsLength = reqLength%8; // extra bits kept per value beyond the whole bytes
+	
+	for(i = 0; i<nbEle;i++)
+	{
+		// compute resiBits: pull resiBitsLength bits for this value out of residualMidBits
+		resiBits = 0;
+		if (resiBitsLength != 0) {
+			int kMod8 = k % 8;
+			int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+			if (rightMovSteps > 0) { // masked bits are shifted down; byte p is not finished yet
+				int code = getRightMovingCode(kMod8, resiBitsLength);
+				resiBits = (residualMidBits[p] & code) >> rightMovSteps;
+			} else if (rightMovSteps < 0) { // bits are split across byte p and byte p+1
+				int code1 = getLeftMovingCode(kMod8);
+				int code2 = getRightMovingCode(kMod8, resiBitsLength);
+				int leftMovSteps = -rightMovSteps;
+				rightMovSteps = 8 - leftMovSteps;
+				resiBits = (residualMidBits[p] & code1) << leftMovSteps;
+				p++;
+				resiBits = resiBits
+						| ((residualMidBits[p] & code2) >> rightMovSteps);
+			} else // rightMovSteps == 0: masked bits need no shift and byte p is finished
+			{
+				int code = getRightMovingCode(kMod8, resiBitsLength);
+				resiBits = (residualMidBits[p] & code);
+				p++;
+			}
+			k += resiBitsLength;
+		}
+		// recover the exact data: the first leadingNum bytes are copied from the previous value, the rest come from exactMidBytes
+		memset(curBytes, 0, 8);
+		leadingNum = leadNum[l++];
+		memcpy(curBytes, preBytes, leadingNum);
+		for (j = leadingNum; j < reqBytesLength; j++)
+			curBytes[j] = exactMidBytes[curByteIndex++];
+		if (resiBitsLength != 0) {
+			unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength)); // residual bits go to the top of the byte after the whole bytes
+			curBytes[reqBytesLength] = resiByte;
+		}
+
+		exactData = bytesToDouble(curBytes);
+		(*decData)[i] = exactData + medianValue; // medianValue is added to every decoded value
+		memcpy(preBytes,curBytes,8);
+	}
+}
diff --git a/thirdparty/SZ/sz/src/sz.c b/thirdparty/SZ/sz/src/sz.c
index 97cb00d8a5725090914964e92229bff939cdb1e5..6cdc35b08b1dfb46ea62acc78f57fc39a78810e6 100644
--- a/thirdparty/SZ/sz/src/sz.c
+++ b/thirdparty/SZ/sz/src/sz.c
@@ -22,6 +22,7 @@
 #include "rw.h"
 #include "Huffman.h"
 #include "conf.h"
+#include "utility.h"
 //#include "CurveFillingCompressStorage.h"
 
 int versionNumber[4] = {SZ_VER_MAJOR,SZ_VER_MINOR,SZ_VER_BUILD,SZ_VER_REVISION};
@@ -36,6 +37,8 @@ sz_params *confparams_dec = NULL; //used for decompression
 
 sz_exedata *exe_params = NULL;
 
+int sz_with_regression = SZ_WITH_LINEAR_REGRESSION; //SZ_NO_REGRESSION
+
 /*following global variables are desgined for time-series based compression*/
 /*sz_varset is not used in the single-snapshot data compression*/
 SZ_VarSet* sz_varset = NULL;
@@ -69,31 +72,15 @@ int SZ_Init(const char *configFilePath)
 
 int SZ_Init_Params(sz_params *params)
 {
-	int x = 1;
-	char *y = (char*)&x;
-	int endianType = BIG_ENDIAN_SYSTEM;
-	if(*y==1) endianType = LITTLE_ENDIAN_SYSTEM;
+	SZ_Init(NULL);
 
-	sysEndianType = endianType;
-	exe_params->SZ_SIZE_TYPE = sizeof(size_t);
+	if(params->losslessCompressor!=GZIP_COMPRESSOR && params->losslessCompressor!=ZSTD_COMPRESSOR)
+		params->losslessCompressor = ZSTD_COMPRESSOR;
 
-	// set default values
-	if(params->max_quant_intervals > 0) 
+	if(params->max_quant_intervals > 0)
 		params->maxRangeRadius = params->max_quant_intervals/2;
-	else
-		params->max_quant_intervals = params->maxRangeRadius*2;
-
-	exe_params->intvCapacity = params->maxRangeRadius*2;
-	exe_params->intvRadius = params->maxRangeRadius;
-
-	if(params->quantization_intervals>0)
-	{
-		updateQuantizationInfo(params->quantization_intervals);
-		exe_params->optQuantMode = 0;
-	}
-	else
-		exe_params->optQuantMode = 1;
-
+		
+	memcpy(confparams_cpr, params, sizeof(sz_params));
 
 	if(params->quantization_intervals%2!=0)
 	{
@@ -101,9 +88,6 @@ int SZ_Init_Params(sz_params *params)
 		return SZ_NSCS;
 	}
 
-	confparams_cpr = (sz_params*)malloc(sizeof(sz_params));
-	memcpy(confparams_cpr, params, sizeof(sz_params));	
-
 	return SZ_SCES;
 }
 
@@ -535,6 +519,14 @@ sz_metadata* SZ_getMetadata(unsigned char* bytes)
 	isConstant = sameRByte & 0x01;
 	//confparams_dec->szMode = (sameRByte & 0x06)>>1;
 	isLossless = (sameRByte & 0x10)>>4;
+	
+	int isRandomAccess = (sameRByte >> 7) & 0x01;
+	
+	if(exe_params==NULL)
+	{
+		exe_params = (sz_exedata *)malloc(sizeof(struct sz_exedata));
+		memset(exe_params, 0, sizeof(struct sz_exedata));
+	}
 	exe_params->SZ_SIZE_TYPE = ((sameRByte & 0x40)>>6)==1?8:4;
 	
 	sz_params* params = convertBytesToSZParams(&(bytes[index]));
@@ -547,8 +539,8 @@ sz_metadata* SZ_getMetadata(unsigned char* bytes)
 		index++; //jump to the dataLength info byte address
 	dataSeriesLength = bytesToSize(&(bytes[index]));// 4 or 8	
 	index += exe_params->SZ_SIZE_TYPE;
-	index += 4; //max_quant_intervals
-	
+	//index += 4; //max_quant_intervals
+
 	sz_metadata* metadata = (sz_metadata*)malloc(sizeof(struct sz_metadata));
 	
 	metadata->versionNumber[0] = versions[0];
@@ -564,19 +556,27 @@ sz_metadata* SZ_getMetadata(unsigned char* bytes)
 	int defactoNBBins = 0; //real # bins
 	if(isConstant==0 && isLossless==0)
 	{
-		int radExpoL = 0, segmentL = 0, pwrErrBoundBytesL = 0;
-		if(metadata->conf_params->errorBoundMode >= PW_REL)
+		if(isRandomAccess==1)
 		{
-			radExpoL = 1;
-			segmentL = exe_params->SZ_SIZE_TYPE;
-			pwrErrBoundBytesL = 4;
+			unsigned char* raBytes = &(bytes[index]);
+			defactoNBBins = bytesToInt_bigEndian(raBytes + sizeof(int) + sizeof(double));
 		}
-		
-		int offset_typearray = 3 + 1 + MetaDataByteLength + exe_params->SZ_SIZE_TYPE + 4 + radExpoL + segmentL + pwrErrBoundBytesL + 4 + 4 + 1 + 8 
-				+ exe_params->SZ_SIZE_TYPE + exe_params->SZ_SIZE_TYPE + exe_params->SZ_SIZE_TYPE;
-		size_t nodeCount = bytesToInt_bigEndian(bytes+offset_typearray);
-		defactoNBBins = (nodeCount+1)/2;
-	}
+		else
+		{
+			int radExpoL = 0, segmentL = 0, pwrErrBoundBytesL = 0;
+			if(metadata->conf_params->errorBoundMode >= PW_REL)
+			{
+				radExpoL = 1;
+				segmentL = exe_params->SZ_SIZE_TYPE;
+				pwrErrBoundBytesL = 4;
+			}
+			
+			int offset_typearray = 3 + 1 + MetaDataByteLength + exe_params->SZ_SIZE_TYPE + 4 + radExpoL + segmentL + pwrErrBoundBytesL + 4 + (4 + params->dataType*4) + 1 + 8 
+					+ exe_params->SZ_SIZE_TYPE + exe_params->SZ_SIZE_TYPE + exe_params->SZ_SIZE_TYPE + 4;
+			defactoNBBins = bytesToInt_bigEndian(bytes+offset_typearray);			
+		}
+
+	}	
 	
 	metadata->defactoNBBins = defactoNBBins;
 	return metadata;
@@ -779,23 +779,6 @@ size_t compute_total_batch_size()
 	return totalSize;
 }
 
-int isZlibFormat(unsigned char magic1, unsigned char magic2)
-{
-	if(magic1==104&&magic2==5) //DC+BS
-		return 1;
-	if(magic1==104&&magic2==129) //DC+DC
-		return 1;
-	if(magic1==104&&magic2==222) //DC+BC
-		return 1;
-	if(magic1==120&&magic2==1) //BC+BS
-		return 1;
-	if(magic1==120&&magic2==156) //BC+DC
-		return 1;
-	if(magic1==120&&magic2==218) //BC+BS
-		return 1;
-	return 0;
-}
-
 void SZ_registerVar(char* varName, int dataType, void* data, 
 			int errBoundMode, double absErrBound, double relBoundRatio, double pwRelBoundRatio, 
 			size_t r5, size_t r4, size_t r3, size_t r2, size_t r1)
diff --git a/thirdparty/SZ/sz/src/sz_double.c b/thirdparty/SZ/sz/src/sz_double.c
index 51819bd44f6ad83485a42267d63e24a1bc600319..d8bf87590de4884dd29df4c8f86115a3618dd434 100644
--- a/thirdparty/SZ/sz/src/sz_double.c
+++ b/thirdparty/SZ/sz/src/sz_double.c
@@ -25,6 +25,7 @@
 #include "zlib.h"
 #include "rw.h"
 #include "sz_double_ts.h"
+#include "utility.h"
 
 unsigned char* SZ_skip_compress_double(double* data, size_t dataLength, size_t* outSize)
 {
@@ -328,7 +329,7 @@ size_t dataLength, double realPrecision, double valueRangeSize, double medianVal
 		//pred = 2*last3CmprsData[0] - last3CmprsData[1];
 		pred = last3CmprsData[0];
 		predAbsErr = fabs(curData - pred);	
-		if(predAbsErr<=checkRadius)
+		if(predAbsErr<checkRadius)
 		{
 			state = (predAbsErr/realPrecision+1)/2;
 			if(curData>=pred)
@@ -1516,8 +1517,8 @@ int errBoundMode, double absErr_Bound, double relBoundRatio, double pwrErrRatio)
 		{
 			if(errBoundMode>=PW_REL)
 			{
-				//SZ_compress_args_double_NoCkRngeNoGzip_1D_pwr(newByteData, oriData, realPrecision, r1, outSize, min, max);
-				SZ_compress_args_double_NoCkRngeNoGzip_1D_pwrgroup(newByteData, oriData, r1, absErr_Bound, relBoundRatio, pwrErrRatio, valueRangeSize, medianValue, outSize);				
+				SZ_compress_args_double_NoCkRngeNoGzip_1D_pwr(newByteData, oriData, pwrErrRatio, r1, outSize, min, max);
+				//SZ_compress_args_double_NoCkRngeNoGzip_1D_pwrgroup(newByteData, oriData, r1, absErr_Bound, relBoundRatio, pwrErrRatio, valueRangeSize, medianValue, outSize);				
 			}
 			else
 				SZ_compress_args_double_NoCkRngeNoGzip_1D(newByteData, oriData, r1, realPrecision, outSize, valueRangeSize, medianValue);
@@ -1562,7 +1563,7 @@ int errBoundMode, double absErr_Bound, double relBoundRatio, double pwRelBoundRa
 			exit(0);
 			return SZ_NSCS;
 		}
-	}				
+	}
 		
 	int status = SZ_SCES;
 	size_t dataLength = computeDataLength(r5,r4,r3,r2,r1);
@@ -1600,13 +1601,12 @@ int errBoundMode, double absErr_Bound, double relBoundRatio, double pwRelBoundRa
 		{
 			if(confparams_cpr->errorBoundMode>=PW_REL)
 			{
-				//SZ_compress_args_double_NoCkRngeNoGzip_1D_pwr(&tmpByteData, oriData, realPrecision, r1, &tmpOutSize, min, max);
-				SZ_compress_args_double_NoCkRngeNoGzip_1D_pwrgroup(&tmpByteData, oriData, r1, absErr_Bound, relBoundRatio, pwRelBoundRatio, 
-				valueRangeSize, medianValue, &tmpOutSize);
+				SZ_compress_args_double_NoCkRngeNoGzip_1D_pwr_pre_log(&tmpByteData, oriData, pwRelBoundRatio, r1, &tmpOutSize, min, max);
+				//SZ_compress_args_double_NoCkRngeNoGzip_1D_pwrgroup(&tmpByteData, oriData, r1, absErr_Bound, relBoundRatio, pwRelBoundRatio, valueRangeSize, medianValue, &tmpOutSize);
 			}
 			else
 #ifdef HAVE_TIMECMPR
-				if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)			
+				if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
 					multisteps->compressionType = SZ_compress_args_double_NoCkRngeNoGzip_1D(&tmpByteData, oriData, r1, realPrecision, &tmpOutSize, valueRangeSize, medianValue);
 				else
 #endif
@@ -1616,40 +1616,58 @@ int errBoundMode, double absErr_Bound, double relBoundRatio, double pwRelBoundRa
 		if (r3==0)
 		{
 			if(confparams_cpr->errorBoundMode>=PW_REL)
-				SZ_compress_args_double_NoCkRngeNoGzip_2D_pwr(&tmpByteData, oriData, realPrecision, r2, r1, &tmpOutSize, min, max);
+				SZ_compress_args_double_NoCkRngeNoGzip_2D_pwr_pre_log(&tmpByteData, oriData, pwRelBoundRatio, r2, r1, &tmpOutSize, min, max);
 			else
 #ifdef HAVE_TIMECMPR
 				if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)			
 					multisteps->compressionType = SZ_compress_args_double_NoCkRngeNoGzip_2D(&tmpByteData, oriData, r2, r1, realPrecision, &tmpOutSize, valueRangeSize, medianValue);
 				else
 #endif
-					SZ_compress_args_double_NoCkRngeNoGzip_2D(&tmpByteData, oriData, r2, r1, realPrecision, &tmpOutSize, valueRangeSize, medianValue);
+				{	
+					if(sz_with_regression == SZ_NO_REGRESSION)
+						SZ_compress_args_double_NoCkRngeNoGzip_2D(&tmpByteData, oriData, r2, r1, realPrecision, &tmpOutSize, valueRangeSize, medianValue);
+					else 
+						tmpByteData = SZ_compress_double_2D_MDQ_nonblocked_with_blocked_regression(oriData, r2, r1, realPrecision, &tmpOutSize);		
+				}
 		}
 		else
 		if (r4==0)
 		{
 			if(confparams_cpr->errorBoundMode>=PW_REL)
-				SZ_compress_args_double_NoCkRngeNoGzip_3D_pwr(&tmpByteData, oriData, realPrecision, r3, r2, r1, &tmpOutSize, min, max);
+				SZ_compress_args_double_NoCkRngeNoGzip_3D_pwr_pre_log(&tmpByteData, oriData, pwRelBoundRatio, r3, r2, r1, &tmpOutSize, min, max);
 			else
 #ifdef HAVE_TIMECMPR
 				if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
 					multisteps->compressionType = SZ_compress_args_double_NoCkRngeNoGzip_3D(&tmpByteData, oriData, r3, r2, r1, realPrecision, &tmpOutSize, valueRangeSize, medianValue);
 				else
 #endif
-					SZ_compress_args_double_NoCkRngeNoGzip_3D(&tmpByteData, oriData, r3, r2, r1, realPrecision, &tmpOutSize, valueRangeSize, medianValue);
+				{
+					if(sz_with_regression == SZ_NO_REGRESSION)
+						SZ_compress_args_double_NoCkRngeNoGzip_3D(&tmpByteData, oriData, r3, r2, r1, realPrecision, &tmpOutSize, valueRangeSize, medianValue);
+					else 
+						tmpByteData = SZ_compress_double_3D_MDQ_nonblocked_with_blocked_regression(oriData, r3, r2, r1, realPrecision, &tmpOutSize);
+				}
+					
+					
 		}
 		else
 		if (r5==0)
 		{
 			if(confparams_cpr->errorBoundMode>=PW_REL)
-				SZ_compress_args_double_NoCkRngeNoGzip_3D_pwr(&tmpByteData, oriData, realPrecision, r4*r3, r2, r1, &tmpOutSize, min, max);
+				SZ_compress_args_double_NoCkRngeNoGzip_3D_pwr_pre_log(&tmpByteData, oriData, pwRelBoundRatio, r4*r3, r2, r1, &tmpOutSize, min, max);
 			else
 #ifdef HAVE_TIMECMPR
 				if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)			
 					multisteps->compressionType = SZ_compress_args_double_NoCkRngeNoGzip_4D(&tmpByteData, oriData, r4, r3, r2, r1, realPrecision, &tmpOutSize, valueRangeSize, medianValue);
 				else
-#endif
-					SZ_compress_args_double_NoCkRngeNoGzip_4D(&tmpByteData, oriData, r4, r3, r2, r1, realPrecision, &tmpOutSize, valueRangeSize, medianValue);
+#endif	
+				{
+					if(sz_with_regression == SZ_NO_REGRESSION)
+						SZ_compress_args_double_NoCkRngeNoGzip_4D(&tmpByteData, oriData, r4, r3, r2, r1, realPrecision, &tmpOutSize, valueRangeSize, medianValue);
+					else 
+						tmpByteData = SZ_compress_double_3D_MDQ_nonblocked_with_blocked_regression(oriData, r4*r3, r2, r1, realPrecision, &tmpOutSize);								
+				}
+		
 		}
 		else
 		{
@@ -1665,7 +1683,7 @@ int errBoundMode, double absErr_Bound, double relBoundRatio, double pwRelBoundRa
 		}
 		else if(confparams_cpr->szMode==SZ_BEST_COMPRESSION || confparams_cpr->szMode==SZ_DEFAULT_COMPRESSION)
 		{
-			*outSize = zlib_compress5(tmpByteData, tmpOutSize, newByteData, confparams_cpr->gzipMode);
+			*outSize = sz_lossless_compress(confparams_cpr->losslessCompressor, confparams_cpr->gzipMode, tmpByteData, tmpOutSize, newByteData);
 			free(tmpByteData);
 		}
 		else
@@ -3121,11 +3139,8 @@ unsigned int optimize_intervals_double_3D_opt(double *oriData, size_t r1, size_t
 		if(radiusIndex>=confparams_cpr->maxRangeRadius)
 		{
 			radiusIndex = confparams_cpr->maxRangeRadius - 1;
-			//printf("radiusIndex=%d\n", radiusIndex);
 		}
 		intervals[radiusIndex]++;
-		// printf("TEST: %ld, i: %ld\tj: %ld\tk: %ld\n", data_pos - oriData);
-		// fflush(stdout);
 		offset_count += confparams_cpr->sampleDistance;
 		if(offset_count >= r3){
 			n2_count ++;
@@ -3141,9 +3156,6 @@ unsigned int optimize_intervals_double_3D_opt(double *oriData, size_t r1, size_t
 		}
 		else data_pos += confparams_cpr->sampleDistance;
 	}	
-	// printf("sample_count: %ld\n", sample_count);
-	// fflush(stdout);
-	// if(*max_freq < 0.15) *max_freq *= 2;
 	//compute the appropriate number
 	size_t targetCount = totalSampleSize*confparams_cpr->predThreshold;
 	size_t sum = 0;
@@ -3161,7 +3173,6 @@ unsigned int optimize_intervals_double_3D_opt(double *oriData, size_t r1, size_t
 	if(powerOf2<32)
 		powerOf2 = 32;
 	free(intervals);
-	//printf("targetCount=%d, sum=%d, totalSampleSize=%d, ratio=%f, accIntervals=%d, powerOf2=%d\n", targetCount, sum, totalSampleSize, (double)sum/(double)totalSampleSize, accIntervals, powerOf2);
 	return powerOf2;
 }
 
@@ -3172,7 +3183,7 @@ unsigned int optimize_intervals_double_2D_opt(double *oriData, size_t r1, size_t
 	double pred_value = 0, pred_err;
 	size_t *intervals = (size_t*)malloc(confparams_cpr->maxRangeRadius*sizeof(size_t));
 	memset(intervals, 0, confparams_cpr->maxRangeRadius*sizeof(size_t));
-	size_t totalSampleSize = 0;//(r1-1)*(r2-1)/confparams_cpr->sampleDistance;
+	size_t totalSampleSize = 0;
 
 	size_t offset_count = confparams_cpr->sampleDistance - 1; // count r2 offset
 	size_t offset_count_2;
@@ -3226,12 +3237,11 @@ unsigned int optimize_intervals_double_1D_opt(double *oriData, size_t dataLength
 	double pred_value = 0, pred_err;
 	size_t *intervals = (size_t*)malloc(confparams_cpr->maxRangeRadius*sizeof(size_t));
 	memset(intervals, 0, confparams_cpr->maxRangeRadius*sizeof(size_t));
-	size_t totalSampleSize = 0;//dataLength/confparams_cpr->sampleDistance;
+	size_t totalSampleSize = 0;
 
 	double * data_pos = oriData + 2;
 	while(data_pos - oriData < dataLength){
 		totalSampleSize++;
-		//pred_value = 2*data_pos[-1] - data_pos[-2];
 		pred_value = data_pos[-1];
 		pred_err = fabs(pred_value - *data_pos);
 		radiusIndex = (unsigned long)((pred_err/realPrecision+1)/2);
@@ -3260,6 +3270,2063 @@ unsigned int optimize_intervals_double_1D_opt(double *oriData, size_t dataLength
 		powerOf2 = 32;
 	
 	free(intervals);
-	//printf("accIntervals=%d, powerOf2=%d\n", accIntervals, powerOf2);
 	return powerOf2;
 }
+
+/* The code above is for SZ 1.4.13; the code below is for SZ 2.0. */
+/* Samples a 2D double array (r1 rows of r2 values) and derives the number of quantization intervals from the
+ * prediction errors; the result is passed through roundUpToPowerOf2() and raised to at least 32.
+ * Outputs: *max_freq = share of samples predicted to within realPrecision; *dense_pos = centre of the two adjacent
+ * realPrecision-wide value bins holding the most samples; *mean_freq = share of samples in those two bins. */
+unsigned int optimize_intervals_double_2D_with_freq_and_dense_pos(double *oriData, size_t r1, size_t r2, double realPrecision, double * dense_pos, double * max_freq, double * mean_freq)
+{
+	const ptrdiff_t range = 8192;	/* number of value bins laid around the sampled mean */
+	const ptrdiff_t radius = 4096;	/* offset added to a bin number so that the mean sits mid-range */
+	const size_t len = r1 * r2;
+	const unsigned int maxRangeRadius = confparams_cpr->maxRangeRadius;
+	const int sampleDistance = confparams_cpr->sampleDistance;
+	const double predThreshold = confparams_cpr->predThreshold;
+
+	/* Rough mean: average of every mean_distance-th value. */
+	const size_t mean_distance = (int) (sqrt(len));
+	double mean = 0.0;
+	size_t mean_count = 0;
+	for(size_t pos = 0; pos < len; pos += mean_distance){
+		mean += oriData[pos];
+		mean_count ++;
+	}
+	if(mean_count > 0) mean /= mean_count;
+
+	/* freq_intervals counts (value - mean) per bin; intervals counts prediction errors per radius. */
+	size_t * freq_intervals = (size_t *) calloc(range, sizeof(size_t));
+	size_t * intervals = (size_t *) calloc(maxRangeRadius, sizeof(size_t));
+
+	size_t freq_count = 0;
+	size_t sample_count = 0;
+	size_t n1_count = 1;
+	size_t offset_count = sampleDistance - 1;
+	size_t offset_count_2 = 0;
+	/* NOTE(review): the first sample reads oriData[sampleDistance - 2], so sampleDistance must be >= 2 - confirm the configuration guarantees it. */
+	size_t pos = r2 + offset_count;
+	while(pos < len){
+		const double * cur = oriData + pos;
+		/* 2D Lorenzo prediction from the left, upper and upper-left neighbours */
+		const double pred_value = cur[-1] + cur[-(ptrdiff_t)r2] - cur[-(ptrdiff_t)r2 - 1];
+		const double pred_err = fabs(pred_value - *cur);
+		if(pred_err < realPrecision) freq_count ++;
+
+		/* Range-check the quotient as a double: converting an out-of-range or NaN double to an integer is undefined. */
+		const double quant = (pred_err/realPrecision+1)/2;
+		if(quant > -1.0 && quant < maxRangeRadius)
+			intervals[(size_t)quant] ++;
+		else
+			intervals[maxRangeRadius - 1] ++;
+
+		const double mean_diff = *cur - mean;
+		const double scaled = mean_diff/realPrecision;
+		ptrdiff_t freq_index;
+		if(!(scaled > -(double)radius))	/* too far below the mean, or NaN */
+			freq_index = 0;
+		else if(scaled >= (double)(range - radius))	/* too far above the mean */
+			freq_index = range - 1;
+		else	/* |scaled| < radius, so the index stays within [0, range - 1] */
+			freq_index = (ptrdiff_t)scaled + radius - (mean_diff > 0 ? 0 : 1);
+		freq_intervals[freq_index] ++;
+
+		/* next sample: sampleDistance further along the row, or in the next row at a column derived from n1_count */
+		offset_count += sampleDistance;
+		if(offset_count >= r2){
+			n1_count ++;
+			offset_count_2 = n1_count % sampleDistance;
+			pos += (r2 + sampleDistance - offset_count) + (sampleDistance - offset_count_2);
+			offset_count = (sampleDistance - offset_count_2);
+			if(offset_count == 0) offset_count ++;
+		}
+		else pos += sampleDistance;
+		sample_count ++;
+	}
+	/* An array too small to yield a sample reports 0 rather than 0/0 = NaN. */
+	*max_freq = sample_count ? freq_count * 1.0/ sample_count : 0.0;
+
+	/* smallest radius whose cumulative error count exceeds predThreshold of the samples */
+	size_t targetCount = sample_count*predThreshold;
+	size_t sum = 0;
+	size_t i;
+	for(i=0;i<maxRangeRadius;i++)
+	{
+		sum += intervals[i];
+		if(sum>targetCount)
+			break;
+	}
+	if(i>=maxRangeRadius)
+		i = maxRangeRadius-1;
+	unsigned int accIntervals = 2*(i+1);
+	unsigned int powerOf2 = roundUpToPowerOf2(accIntervals);
+	if(powerOf2<32)
+		powerOf2 = 32;
+
+	/* densest pair of adjacent bins; the two clamp bins 0 and range-1 are not considered */
+	size_t max_sum = 0;
+	ptrdiff_t max_index = 0;
+	for(ptrdiff_t k=1; k<range-2; k++){
+		const size_t tmp_sum = freq_intervals[k] + freq_intervals[k+1];
+		if(tmp_sum > max_sum){
+			max_sum = tmp_sum;
+			max_index = k;
+		}
+	}
+	*dense_pos = mean + realPrecision * (max_index + 1 - radius);
+	*mean_freq = sample_count ? max_sum * 1.0 / sample_count : 0.0;
+
+	free(freq_intervals);
+	free(intervals);
+	return powerOf2;
+}
+
+/* Samples a 3D double array (r1 x r2 x r3, r3 varying fastest) and derives the number of quantization intervals from
+ * the prediction errors (result passed through roundUpToPowerOf2(), at least 32). *max_freq = share of samples predicted
+ * to within realPrecision; *dense_pos = centre of the two adjacent value bins holding the most samples; *mean_freq = their share. */
+unsigned int optimize_intervals_double_3D_with_freq_and_dense_pos(double *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, double * dense_pos, double * max_freq, double * mean_freq)
+{
+	const ptrdiff_t range = 8192;	/* number of value bins laid around the sampled mean */
+	const ptrdiff_t radius = 4096;	/* offset added to a bin number so that the mean sits mid-range */
+	const size_t r23 = r2 * r3;
+	const size_t len = r1 * r23;
+	const unsigned int maxRangeRadius = confparams_cpr->maxRangeRadius;
+	const int sampleDistance = confparams_cpr->sampleDistance;
+	const double predThreshold = confparams_cpr->predThreshold;
+
+	/* Rough mean: stride of mean_distance, stepping back one element whenever a row or a plane has been crossed. */
+	/* The back-steps are skipped for strides of 1 or 2, where they could cancel the stride and loop forever (e.g. r1=2, r2=1, r3=2). */
+	const size_t mean_distance = (int) (sqrt(len));
+	double mean = 0.0;
+	size_t mean_count = 0;
+	size_t offset_count = 0;
+	size_t offset_count_2 = 0;
+	size_t pos = 0;
+	while(pos < len){
+		mean += oriData[pos];
+		mean_count ++;
+		pos += mean_distance;
+		offset_count += mean_distance;
+		offset_count_2 += mean_distance;
+		if(offset_count >= r3){
+			offset_count = 0;
+			if(mean_distance > 2) pos -= 1;
+		}
+		if(offset_count_2 >= r23){
+			offset_count_2 = 0;
+			if(mean_distance > 2) pos -= 1;
+		}
+	}
+	if(mean_count > 0) mean /= mean_count;
+
+	/* freq_intervals counts (value - mean) per bin; intervals counts prediction errors per radius. */
+	size_t * freq_intervals = (size_t *) calloc(range, sizeof(size_t));
+	size_t * intervals = (size_t *) calloc(maxRangeRadius, sizeof(size_t));
+
+	const ptrdiff_t row = (ptrdiff_t)r3, plane = (ptrdiff_t)r23;	/* signed strides for neighbour access */
+	size_t freq_count = 0;
+	size_t sample_count = 0;
+	size_t n1_count = 1, n2_count = 1; // count i,j sum
+	/* NOTE(review): the first sample reads oriData[sampleDistance - 3], so sampleDistance must be >= 3 - confirm the configuration guarantees it. */
+	offset_count = confparams_cpr->sampleDistance - 2; // count r3 offset
+	pos = r23 + r3 + offset_count;
+	while(pos < len){
+		const double * cur = oriData + pos;
+		/* 3D Lorenzo prediction from the seven already-visited corner neighbours */
+		const double pred_value = cur[-1] + cur[-row] + cur[-plane] - cur[-1-plane] - cur[-row-1] - cur[-row-plane] + cur[-row-plane-1];
+		const double pred_err = fabs(pred_value - *cur);
+		if(pred_err < realPrecision) freq_count ++;
+
+		/* Range-check the quotient as a double: converting an out-of-range or NaN double to an integer is undefined. */
+		const double quant = (pred_err/realPrecision+1)/2;
+		if(quant > -1.0 && quant < maxRangeRadius)
+			intervals[(size_t)quant] ++;
+		else
+			intervals[maxRangeRadius - 1] ++;
+
+		const double mean_diff = *cur - mean;
+		const double scaled = mean_diff/realPrecision;
+		ptrdiff_t freq_index;
+		if(!(scaled > -(double)radius))	/* too far below the mean, or NaN */
+			freq_index = 0;
+		else if(scaled >= (double)(range - radius))	/* too far above the mean */
+			freq_index = range - 1;
+		else	/* |scaled| < radius, so the index stays within [0, range - 1] */
+			freq_index = (ptrdiff_t)scaled + radius - (mean_diff > 0 ? 0 : 1);
+		freq_intervals[freq_index] ++;
+
+		/* next sample: sampleDistance further along the row, or in the next row/plane at a column derived from n1_count + n2_count */
+		offset_count += sampleDistance;
+		if(offset_count >= r3){
+			n2_count ++;
+			if(n2_count == r2){
+				n1_count ++;
+				n2_count = 1;
+				pos += r3;
+			}
+			offset_count_2 = (n1_count + n2_count) % sampleDistance;
+			pos += (r3 + sampleDistance - offset_count) + (sampleDistance - offset_count_2);
+			offset_count = (sampleDistance - offset_count_2);
+			if(offset_count == 0) offset_count ++;
+		}
+		else pos += sampleDistance;
+		sample_count ++;
+	}
+	/* An array too small to yield a sample reports 0 rather than 0/0 = NaN. */
+	*max_freq = sample_count ? freq_count * 1.0/ sample_count : 0.0;
+
+	/* smallest radius whose cumulative error count exceeds predThreshold of the samples */
+	size_t targetCount = sample_count*predThreshold;
+	size_t sum = 0;
+	size_t i;
+	for(i=0;i<maxRangeRadius;i++)
+	{
+		sum += intervals[i];
+		if(sum>targetCount)
+			break;
+	}
+	if(i>=maxRangeRadius)
+		i = maxRangeRadius-1;
+	unsigned int accIntervals = 2*(i+1);
+	unsigned int powerOf2 = roundUpToPowerOf2(accIntervals);
+	if(powerOf2<32)
+		powerOf2 = 32;
+
+	/* densest pair of adjacent bins; the two clamp bins 0 and range-1 are not considered */
+	size_t max_sum = 0;
+	ptrdiff_t max_index = 0;
+	for(ptrdiff_t k=1; k<range-2; k++){
+		const size_t tmp_sum = freq_intervals[k] + freq_intervals[k+1];
+		if(tmp_sum > max_sum){
+			max_sum = tmp_sum;
+			max_index = k;
+		}
+	}
+	*dense_pos = mean + realPrecision * (max_index + 1 - radius);
+	*mean_freq = sample_count ? max_sum * 1.0 / sample_count : 0.0;
+
+	free(freq_intervals);
+	free(intervals);
+	return powerOf2;
+}
+
+#define MIN(a, b) ((a) < (b) ? (a) : (b)) /* smaller of a and b; fully parenthesised so it is safe inside larger expressions (each argument may still be evaluated twice) */
+unsigned char * SZ_compress_double_2D_MDQ_nonblocked_with_blocked_regression(double *oriData, size_t r1, size_t r2, double realPrecision, size_t * comp_size){
+
+	unsigned int quantization_intervals;
+	double sz_sample_correct_freq = -1;//0.5; //-1
+	double dense_pos;
+	double mean_flush_freq;
+	unsigned char use_mean = 0;
+
+	if(exe_params->optQuantMode==1)
+	{
+		quantization_intervals = optimize_intervals_double_2D_with_freq_and_dense_pos(oriData, r1, r2, realPrecision, &dense_pos, &sz_sample_correct_freq, &mean_flush_freq);
+		if(mean_flush_freq > 0.5 || mean_flush_freq > sz_sample_correct_freq) use_mean = 1;
+		updateQuantizationInfo(quantization_intervals);
+	}	
+	else{
+		quantization_intervals = exe_params->intvCapacity;
+	}
+
+	// calculate block dims
+	size_t num_x, num_y;
+	size_t block_size = 16;
+
+	SZ_COMPUTE_2D_NUMBER_OF_BLOCKS(r1, num_x, block_size);
+	SZ_COMPUTE_2D_NUMBER_OF_BLOCKS(r2, num_y, block_size);
+
+	size_t split_index_x, split_index_y;
+	size_t early_blockcount_x, early_blockcount_y;
+	size_t late_blockcount_x, late_blockcount_y;
+	SZ_COMPUTE_BLOCKCOUNT(r1, num_x, split_index_x, early_blockcount_x, late_blockcount_x);
+	SZ_COMPUTE_BLOCKCOUNT(r2, num_y, split_index_y, early_blockcount_y, late_blockcount_y);
+
+	size_t max_num_block_elements = early_blockcount_x * early_blockcount_y;
+	size_t num_blocks = num_x * num_y;
+	size_t num_elements = r1 * r2;
+
+	size_t dim0_offset = r2;	
+
+	int * result_type = (int *) malloc(num_elements * sizeof(int));
+	size_t unpred_data_max_size = max_num_block_elements;
+	double * result_unpredictable_data = (double *) malloc(unpred_data_max_size * sizeof(double) * num_blocks);
+	size_t total_unpred = 0;
+	size_t unpredictable_count;
+	double * data_pos = oriData;
+	int * type = result_type;
+	size_t offset_x, offset_y;
+	size_t current_blockcount_x, current_blockcount_y;
+
+	double * reg_params = (double *) malloc(num_blocks * 4 * sizeof(double));
+	double * reg_params_pos = reg_params;
+	// move regression part out
+	size_t params_offset_b = num_blocks;
+	size_t params_offset_c = 2*num_blocks;
+	for(size_t i=0; i<num_x; i++){
+		for(size_t j=0; j<num_y; j++){
+			current_blockcount_x = (i < split_index_x) ? early_blockcount_x : late_blockcount_x;
+			current_blockcount_y = (j < split_index_y) ? early_blockcount_y : late_blockcount_y;
+			offset_x = (i < split_index_x) ? i * early_blockcount_x : i * late_blockcount_x + split_index_x;
+			offset_y = (j < split_index_y) ? j * early_blockcount_y : j * late_blockcount_y + split_index_y;
+
+			data_pos = oriData + offset_x * dim0_offset + offset_y;
+
+			{
+				double * cur_data_pos = data_pos;
+				double fx = 0.0;
+				double fy = 0.0;
+				double f = 0;
+				double sum_x; 
+				double curData;
+				for(size_t i=0; i<current_blockcount_x; i++){
+					sum_x = 0;
+					for(size_t j=0; j<current_blockcount_y; j++){
+						curData = *cur_data_pos;
+						sum_x += curData;
+						fy += curData * j;
+						cur_data_pos ++;
+					}
+					fx += sum_x * i;
+					f += sum_x;
+					cur_data_pos += dim0_offset - current_blockcount_y;
+				}
+				double coeff = 1.0 / (current_blockcount_x * current_blockcount_y);
+				reg_params_pos[0] = (2 * fx / (current_blockcount_x - 1) - f) * 6 * coeff / (current_blockcount_x + 1);
+				reg_params_pos[params_offset_b] = (2 * fy / (current_blockcount_y - 1) - f) * 6 * coeff / (current_blockcount_y + 1);
+				reg_params_pos[params_offset_c] = f * coeff - ((current_blockcount_x - 1) * reg_params_pos[0] / 2 + (current_blockcount_y - 1) * reg_params_pos[params_offset_b] / 2);
+			}
+
+			reg_params_pos ++;
+		}
+	}
+
+	//Compress coefficient arrays
+	double precision_a, precision_b, precision_c;
+	double rel_param_err = 0.15/3;
+	precision_a = rel_param_err * realPrecision / late_blockcount_x;
+	precision_b = rel_param_err * realPrecision / late_blockcount_y;
+	precision_c = rel_param_err * realPrecision;
+
+	double mean = 0;
+	use_mean = 0;
+	if(use_mean){
+		// compute mean
+		double sum = 0.0;
+		size_t mean_count = 0;
+		for(size_t i=0; i<num_elements; i++){
+			if(fabs(oriData[i] - dense_pos) < realPrecision){
+				sum += oriData[i];
+				mean_count ++;
+			}
+		}
+		if(mean_count > 0) mean = sum / mean_count;
+	}
+
+
+	double tmp_realPrecision = realPrecision;
+
+	// use two prediction buffers for higher performance
+	double * unpredictable_data = result_unpredictable_data;
+	unsigned char * indicator = (unsigned char *) malloc(num_blocks * sizeof(unsigned char));
+	memset(indicator, 0, num_blocks * sizeof(unsigned char));
+	size_t reg_count = 0;
+	size_t strip_dim_0 = early_blockcount_x + 1;
+	size_t strip_dim_1 = r2 + 1;
+	size_t strip_dim0_offset = strip_dim_1;
+	unsigned char * indicator_pos = indicator;
+	size_t prediction_buffer_size = strip_dim_0 * strip_dim0_offset * sizeof(double);
+	double * prediction_buffer_1 = (double *) malloc(prediction_buffer_size);
+	memset(prediction_buffer_1, 0, prediction_buffer_size);
+	double * prediction_buffer_2 = (double *) malloc(prediction_buffer_size);
+	memset(prediction_buffer_2, 0, prediction_buffer_size);
+	double * cur_pb_buf = prediction_buffer_1;
+	double * next_pb_buf = prediction_buffer_2;
+	double * cur_pb_buf_pos;
+	double * next_pb_buf_pos;
+	int intvCapacity = exe_params->intvCapacity;
+	int intvRadius = exe_params->intvRadius;
+	int use_reg = 0;
+
+	reg_params_pos = reg_params;
+	// compress the regression coefficients on the fly
+	double last_coeffcients[3] = {0.0};
+	int coeff_intvCapacity_sz = 65536;
+	int coeff_intvRadius = coeff_intvCapacity_sz / 2;
+	int * coeff_type[3];
+	int * coeff_result_type = (int *) malloc(num_blocks*3*sizeof(int));
+	double * coeff_unpred_data[3];
+	double * coeff_unpredictable_data = (double *) malloc(num_blocks*3*sizeof(double));
+	double precision[3];
+	precision[0] = precision_a, precision[1] = precision_b, precision[2] = precision_c;
+	for(int i=0; i<3; i++){
+		coeff_type[i] = coeff_result_type + i * num_blocks;
+		coeff_unpred_data[i] = coeff_unpredictable_data + i * num_blocks;
+	}
+	int coeff_index = 0;
+	unsigned int coeff_unpredictable_count[3] = {0};
+	if(use_mean){
+		type = result_type;
+		int intvCapacity_sz = intvCapacity - 2;
+		for(size_t i=0; i<num_x; i++){
+			current_blockcount_x = (i < split_index_x) ? early_blockcount_x : late_blockcount_x;
+			offset_x = (i < split_index_x) ? i * early_blockcount_x : i * late_blockcount_x + split_index_x;
+			data_pos = oriData + offset_x * dim0_offset;
+
+			cur_pb_buf_pos = cur_pb_buf + strip_dim0_offset + 1;
+			next_pb_buf_pos = next_pb_buf + 1;
+			double * pb_pos = cur_pb_buf_pos;
+			double * next_pb_pos = next_pb_buf_pos;
+
+			for(size_t j=0; j<num_y; j++){
+				offset_y = (j < split_index_y) ? j * early_blockcount_y : j * late_blockcount_y + split_index_y;
+				current_blockcount_y = (j < split_index_y) ? early_blockcount_y : late_blockcount_y;
+				
+				/*sampling: decide which predictor to use (regression or lorenzo)*/
+				{
+					double * cur_data_pos;
+					double curData;
+					double pred_reg, pred_sz;
+					double err_sz = 0.0, err_reg = 0.0;
+					// [1, 1] [3, 3] [5, 5] [7, 7] [9, 9]
+					// [1, 9] [3, 7]		[7, 3] [9, 1]
+					int count = 0;
+					for(int i=1; i<current_blockcount_x; i+=2){
+						cur_data_pos = data_pos + i * dim0_offset + i;
+						curData = *cur_data_pos;
+						pred_sz = cur_data_pos[-1] + cur_data_pos[-dim0_offset] - cur_data_pos[-dim0_offset - 1];
+						pred_reg = reg_params_pos[0] * i + reg_params_pos[params_offset_b] * i + reg_params_pos[params_offset_c];
+						
+						err_sz += MIN(fabs(pred_sz - curData) + realPrecision*0.81, fabs(mean - curData));
+
+						err_reg += fabs(pred_reg - curData);
+
+						cur_data_pos = data_pos + i * dim0_offset + (block_size - i);
+						curData = *cur_data_pos;
+						pred_sz = cur_data_pos[-1] + cur_data_pos[-dim0_offset] - cur_data_pos[-dim0_offset - 1];
+						pred_reg = reg_params_pos[0] * i + reg_params_pos[params_offset_b] * (block_size - i) + reg_params_pos[params_offset_c];
+						err_sz += MIN(fabs(pred_sz - curData) + realPrecision*0.81, fabs(mean - curData));
+						
+						err_reg += fabs(pred_reg - curData);
+
+						count += 2;
+					}
+
+					use_reg = (err_reg < err_sz);
+				}
+				if(use_reg)
+				{
+					{
+						/*predict coefficients in current block via previous reg_block*/
+						double cur_coeff;
+						double diff, itvNum;
+						for(int e=0; e<3; e++){
+							cur_coeff = reg_params_pos[e*num_blocks];
+							diff = cur_coeff - last_coeffcients[e];
+							itvNum = fabs(diff)/precision[e] + 1;
+							if (itvNum < coeff_intvCapacity_sz){
+								if (diff < 0) itvNum = -itvNum;
+								coeff_type[e][coeff_index] = (int) (itvNum/2) + coeff_intvRadius;
+								last_coeffcients[e] = last_coeffcients[e] + 2 * (coeff_type[e][coeff_index] - coeff_intvRadius) * precision[e];
+								//ganrantee comporession error against the case of machine-epsilon
+								if(fabs(cur_coeff - last_coeffcients[e])>precision[e]){	
+									coeff_type[e][coeff_index] = 0;
+									last_coeffcients[e] = cur_coeff;	
+									coeff_unpred_data[e][coeff_unpredictable_count[e] ++] = cur_coeff;
+								}					
+							}
+							else{
+								coeff_type[e][coeff_index] = 0;
+								last_coeffcients[e] = cur_coeff;
+								coeff_unpred_data[e][coeff_unpredictable_count[e] ++] = cur_coeff;
+							}
+						}
+						coeff_index ++;
+					}
+					double curData;
+					double pred;
+					double itvNum;
+					double diff;
+					size_t index = 0;
+					size_t block_unpredictable_count = 0;
+					double * cur_data_pos = data_pos;
+					for(size_t ii=0; ii<current_blockcount_x - 1; ii++){
+						for(size_t jj=0; jj<current_blockcount_y - 1; jj++){
+							curData = *cur_data_pos;
+							pred = last_coeffcients[0] * ii + last_coeffcients[1] * jj + last_coeffcients[2];
+							diff = curData - pred;
+							itvNum = fabs(diff)/realPrecision + 1;
+							if (itvNum < intvCapacity){
+								if (diff < 0) itvNum = -itvNum;
+								type[index] = (int) (itvNum/2) + intvRadius;
+								pred = pred + 2 * (type[index] - intvRadius) * realPrecision;
+								//ganrantee comporession error against the case of machine-epsilon
+								if(fabs(curData - pred)>realPrecision){	
+									type[index] = 0;
+									pred = curData;
+									unpredictable_data[block_unpredictable_count ++] = curData;
+								}		
+							}
+							else{
+								type[index] = 0;
+								pred = curData;
+								unpredictable_data[block_unpredictable_count ++] = curData;
+							}
+							index ++;	
+							cur_data_pos ++;
+						}
+						/*dealing with the last jj (boundary)*/
+						{
+							size_t jj = current_blockcount_y - 1;
+							curData = *cur_data_pos;
+							pred = last_coeffcients[0] * ii + last_coeffcients[1] * jj + last_coeffcients[2];
+							diff = curData - pred;
+							itvNum = fabs(diff)/realPrecision + 1;
+							if (itvNum < intvCapacity){
+								if (diff < 0) itvNum = -itvNum;
+								type[index] = (int) (itvNum/2) + intvRadius;
+								pred = pred + 2 * (type[index] - intvRadius) * realPrecision;
+								//ganrantee comporession error against the case of machine-epsilon
+								if(fabs(curData - pred)>realPrecision){	
+									type[index] = 0;
+									pred = curData;
+									unpredictable_data[block_unpredictable_count ++] = curData;
+								}		
+							}
+							else{
+								type[index] = 0;
+								pred = curData;
+								unpredictable_data[block_unpredictable_count ++] = curData;
+							}
+
+							// assign value to block surfaces
+							pb_pos[ii * strip_dim0_offset + jj] = pred;
+							index ++;	
+							cur_data_pos ++;
+						}
+						cur_data_pos += dim0_offset - current_blockcount_y;
+					}
+					/*dealing with the last ii (boundary)*/
+					{
+						size_t ii = current_blockcount_x - 1;
+						for(size_t jj=0; jj<current_blockcount_y - 1; jj++){
+							curData = *cur_data_pos;
+							pred = last_coeffcients[0] * ii + last_coeffcients[1] * jj + last_coeffcients[2];
+							diff = curData - pred;
+							itvNum = fabs(diff)/realPrecision + 1;
+							if (itvNum < intvCapacity){
+								if (diff < 0) itvNum = -itvNum;
+								type[index] = (int) (itvNum/2) + intvRadius;
+								pred = pred + 2 * (type[index] - intvRadius) * realPrecision;
+								//ganrantee comporession error against the case of machine-epsilon
+								if(fabs(curData - pred)>realPrecision){	
+									type[index] = 0;
+									pred = curData;
+									unpredictable_data[block_unpredictable_count ++] = curData;
+								}		
+							}
+							else{
+								type[index] = 0;
+								pred = curData;
+								unpredictable_data[block_unpredictable_count ++] = curData;
+							}
+							// assign value to next prediction buffer
+							next_pb_pos[jj] = pred;
+							index ++;	
+							cur_data_pos ++;
+						}
+						/*dealing with the last jj (boundary)*/
+						{
+							size_t jj = current_blockcount_y - 1;
+							curData = *cur_data_pos;
+							pred = last_coeffcients[0] * ii + last_coeffcients[1] * jj + last_coeffcients[2];
+							diff = curData - pred;
+							itvNum = fabs(diff)/realPrecision + 1;
+							if (itvNum < intvCapacity){
+								if (diff < 0) itvNum = -itvNum;
+								type[index] = (int) (itvNum/2) + intvRadius;
+								pred = pred + 2 * (type[index] - intvRadius) * realPrecision;
+								//ganrantee comporession error against the case of machine-epsilon
+								if(fabs(curData - pred)>realPrecision){	
+									type[index] = 0;
+									pred = curData;
+									unpredictable_data[block_unpredictable_count ++] = curData;
+								}		
+							}
+							else{
+								type[index] = 0;
+								pred = curData;
+								unpredictable_data[block_unpredictable_count ++] = curData;
+							}
+
+							// assign value to block surfaces
+							pb_pos[ii * strip_dim0_offset + jj] = pred;
+							// assign value to next prediction buffer
+							next_pb_pos[jj] = pred;
+
+							index ++;	
+							cur_data_pos ++;
+						}
+					} // end ii == -1
+					unpredictable_count = block_unpredictable_count;
+					total_unpred += unpredictable_count;
+					unpredictable_data += unpredictable_count;					
+					reg_count ++;
+				}// end use_reg
+				else{
+					// use SZ
+					// SZ predication
+					unpredictable_count = 0;
+					double * cur_pb_pos = pb_pos;
+					double * cur_data_pos = data_pos;
+					double curData;
+					double pred2D;
+					double itvNum, diff;
+					size_t index = 0;
+					for(size_t ii=0; ii<current_blockcount_x - 1; ii++){
+						for(size_t jj=0; jj<current_blockcount_y; jj++){
+							curData = *cur_data_pos;
+							if(fabs(curData - mean) <= realPrecision){
+								// adjust type[index] to intvRadius for coherence with freq in reg
+								type[index] = intvRadius;
+								*cur_pb_pos = mean;
+							}
+							else
+							{
+								pred2D = cur_pb_pos[-1] + cur_pb_pos[-strip_dim0_offset] - cur_pb_pos[-strip_dim0_offset - 1];
+								diff = curData - pred2D;
+								itvNum = fabs(diff)/realPrecision + 1;
+								if (itvNum < intvCapacity_sz){
+									if (diff < 0) itvNum = -itvNum;
+									type[index] = (int) (itvNum/2) + intvRadius;
+									*cur_pb_pos = pred2D + 2 * (type[index] - intvRadius) * tmp_realPrecision;
+									if(type[index] <= intvRadius) type[index] -= 1;
+									//ganrantee comporession error against the case of machine-epsilon
+									if(fabs(curData - *cur_pb_pos)>tmp_realPrecision){	
+										type[index] = 0;
+										*cur_pb_pos = curData;	
+										unpredictable_data[unpredictable_count ++] = curData;
+									}					
+								}
+								else{
+									type[index] = 0;
+									*cur_pb_pos = curData;
+									unpredictable_data[unpredictable_count ++] = curData;
+								}
+							}
+							index ++;
+							cur_pb_pos ++;
+							cur_data_pos ++;
+						}
+						cur_pb_pos += strip_dim0_offset - current_blockcount_y;
+						cur_data_pos += dim0_offset - current_blockcount_y;
+					}
+					/*dealing with the last ii (boundary)*/
+					{
+						// ii == current_blockcount_x - 1
+						for(size_t jj=0; jj<current_blockcount_y; jj++){
+							curData = *cur_data_pos;
+							if(fabs(curData - mean) <= realPrecision){
+								// adjust type[index] to intvRadius for coherence with freq in reg
+								type[index] = intvRadius;
+								*cur_pb_pos = mean;
+							}
+							else
+							{
+								pred2D = cur_pb_pos[-1] + cur_pb_pos[-strip_dim0_offset] - cur_pb_pos[-strip_dim0_offset - 1];
+								diff = curData - pred2D;
+								itvNum = fabs(diff)/realPrecision + 1;
+								if (itvNum < intvCapacity_sz){
+									if (diff < 0) itvNum = -itvNum;
+									type[index] = (int) (itvNum/2) + intvRadius;
+									*cur_pb_pos = pred2D + 2 * (type[index] - intvRadius) * tmp_realPrecision;
+									if(type[index] <= intvRadius) type[index] -= 1;
+									//ganrantee comporession error against the case of machine-epsilon
+									if(fabs(curData - *cur_pb_pos)>tmp_realPrecision){	
+										type[index] = 0;
+										*cur_pb_pos = curData;	
+										unpredictable_data[unpredictable_count ++] = curData;
+									}					
+								}
+								else{
+									type[index] = 0;
+									*cur_pb_pos = curData;
+									unpredictable_data[unpredictable_count ++] = curData;
+								}
+							}
+							next_pb_pos[jj] = *cur_pb_pos;
+							index ++;
+							cur_pb_pos ++;
+							cur_data_pos ++;
+						}
+					}
+					total_unpred += unpredictable_count;
+					unpredictable_data += unpredictable_count;
+					// change indicator
+					indicator_pos[j] = 1;
+				}// end SZ
+				reg_params_pos ++;
+				data_pos += current_blockcount_y;
+				pb_pos += current_blockcount_y;
+				next_pb_pos += current_blockcount_y;
+				type += current_blockcount_x * current_blockcount_y;
+			}// end j
+			indicator_pos += num_y;
+			double * tmp;
+			tmp = cur_pb_buf;
+			cur_pb_buf = next_pb_buf;
+			next_pb_buf = tmp;
+		}// end i
+	}// end use mean
+	else{
+		type = result_type;
+		int intvCapacity_sz = intvCapacity - 2;
+		for(size_t i=0; i<num_x; i++){
+			current_blockcount_x = (i < split_index_x) ? early_blockcount_x : late_blockcount_x;
+			offset_x = (i < split_index_x) ? i * early_blockcount_x : i * late_blockcount_x + split_index_x;
+			data_pos = oriData + offset_x * dim0_offset;
+
+			cur_pb_buf_pos = cur_pb_buf + strip_dim0_offset + 1;
+			next_pb_buf_pos = next_pb_buf + 1;
+			double * pb_pos = cur_pb_buf_pos;
+			double * next_pb_pos = next_pb_buf_pos;
+
+			for(size_t j=0; j<num_y; j++){
+				offset_y = (j < split_index_y) ? j * early_blockcount_y : j * late_blockcount_y + split_index_y;
+				current_blockcount_y = (j < split_index_y) ? early_blockcount_y : late_blockcount_y;
+				/*sampling*/
+				{
+					// sample [2i + 1, 2i + 1] [2i + 1, bs - 2i]
+					double * cur_data_pos;
+					double curData;
+					double pred_reg, pred_sz;
+					double err_sz = 0.0, err_reg = 0.0;
+					// [1, 1] [3, 3] [5, 5] [7, 7] [9, 9]
+					// [1, 9] [3, 7]		[7, 3] [9, 1]
+					int count = 0;
+					for(int i=1; i<current_blockcount_x; i+=2){
+						cur_data_pos = data_pos + i * dim0_offset + i;
+						curData = *cur_data_pos;
+						pred_sz = cur_data_pos[-1] + cur_data_pos[-dim0_offset] - cur_data_pos[-dim0_offset - 1];
+						pred_reg = reg_params_pos[0] * i + reg_params_pos[params_offset_b] * i + reg_params_pos[params_offset_c];
+						err_sz += fabs(pred_sz - curData);
+						err_reg += fabs(pred_reg - curData);
+
+						cur_data_pos = data_pos + i * dim0_offset + (block_size - i);
+						curData = *cur_data_pos;
+						pred_sz = cur_data_pos[-1] + cur_data_pos[-dim0_offset] - cur_data_pos[-dim0_offset - 1];
+						pred_reg = reg_params_pos[0] * i + reg_params_pos[params_offset_b] * (block_size - i) + reg_params_pos[params_offset_c];
+						err_sz += fabs(pred_sz - curData);
+						err_reg += fabs(pred_reg - curData);
+
+						count += 2;
+					}
+					err_sz += realPrecision * count * 0.81;
+					use_reg = (err_reg < err_sz);
+
+				}
+				if(use_reg)
+				{
+					{
+						/*predict coefficients in current block via previous reg_block*/
+						double cur_coeff;
+						double diff, itvNum;
+						for(int e=0; e<3; e++){
+							cur_coeff = reg_params_pos[e*num_blocks];
+							diff = cur_coeff - last_coeffcients[e];
+							itvNum = fabs(diff)/precision[e] + 1;
+							if (itvNum < coeff_intvCapacity_sz){
+								if (diff < 0) itvNum = -itvNum;
+								coeff_type[e][coeff_index] = (int) (itvNum/2) + coeff_intvRadius;
+								last_coeffcients[e] = last_coeffcients[e] + 2 * (coeff_type[e][coeff_index] - coeff_intvRadius) * precision[e];
+								//ganrantee comporession error against the case of machine-epsilon
+								if(fabs(cur_coeff - last_coeffcients[e])>precision[e]){	
+									coeff_type[e][coeff_index] = 0;
+									last_coeffcients[e] = cur_coeff;	
+									coeff_unpred_data[e][coeff_unpredictable_count[e] ++] = cur_coeff;
+								}					
+							}
+							else{
+								coeff_type[e][coeff_index] = 0;
+								last_coeffcients[e] = cur_coeff;
+								coeff_unpred_data[e][coeff_unpredictable_count[e] ++] = cur_coeff;
+							}
+						}
+						coeff_index ++;
+					}
+					double curData;
+					double pred;
+					double itvNum;
+					double diff;
+					size_t index = 0;
+					size_t block_unpredictable_count = 0;
+					double * cur_data_pos = data_pos;
+					for(size_t ii=0; ii<current_blockcount_x - 1; ii++){
+						for(size_t jj=0; jj<current_blockcount_y - 1; jj++){
+							curData = *cur_data_pos;
+							pred = last_coeffcients[0] * ii + last_coeffcients[1] * jj + last_coeffcients[2];
+							diff = curData - pred;
+							itvNum = fabs(diff)/realPrecision + 1;
+							if (itvNum < intvCapacity){
+								if (diff < 0) itvNum = -itvNum;
+								type[index] = (int) (itvNum/2) + intvRadius;
+								pred = pred + 2 * (type[index] - intvRadius) * realPrecision;
+								//ganrantee comporession error against the case of machine-epsilon
+								if(fabs(curData - pred)>realPrecision){	
+									type[index] = 0;
+									pred = curData;
+									unpredictable_data[block_unpredictable_count ++] = curData;
+								}		
+							}
+							else{
+								type[index] = 0;
+								pred = curData;
+								unpredictable_data[block_unpredictable_count ++] = curData;
+							}
+							index ++;	
+							cur_data_pos ++;
+						}
+						/*dealing with the last jj (boundary)*/
+						{
+							// jj == current_blockcount_y - 1
+							size_t jj = current_blockcount_y - 1;
+							curData = *cur_data_pos;
+							pred = last_coeffcients[0] * ii + last_coeffcients[1] * jj + last_coeffcients[2];
+							diff = curData - pred;
+							itvNum = fabs(diff)/realPrecision + 1;
+							if (itvNum < intvCapacity){
+								if (diff < 0) itvNum = -itvNum;
+								type[index] = (int) (itvNum/2) + intvRadius;
+								pred = pred + 2 * (type[index] - intvRadius) * realPrecision;
+								//ganrantee comporession error against the case of machine-epsilon
+								if(fabs(curData - pred)>realPrecision){	
+									type[index] = 0;
+									pred = curData;
+									unpredictable_data[block_unpredictable_count ++] = curData;
+								}		
+							}
+							else{
+								type[index] = 0;
+								pred = curData;
+								unpredictable_data[block_unpredictable_count ++] = curData;
+							}
+
+							// assign value to block surfaces
+							pb_pos[ii * strip_dim0_offset + jj] = pred;
+							index ++;	
+							cur_data_pos ++;
+						}
+						cur_data_pos += dim0_offset - current_blockcount_y;
+					}
+					/*dealing with the last ii (boundary)*/
+					{
+						size_t ii = current_blockcount_x - 1;
+						for(size_t jj=0; jj<current_blockcount_y - 1; jj++){
+							curData = *cur_data_pos;
+							pred = last_coeffcients[0] * ii + last_coeffcients[1] * jj + last_coeffcients[2];
+							diff = curData - pred;
+							itvNum = fabs(diff)/realPrecision + 1;
+							if (itvNum < intvCapacity){
+								if (diff < 0) itvNum = -itvNum;
+								type[index] = (int) (itvNum/2) + intvRadius;
+								pred = pred + 2 * (type[index] - intvRadius) * realPrecision;
+								//ganrantee comporession error against the case of machine-epsilon
+								if(fabs(curData - pred)>realPrecision){	
+									type[index] = 0;
+									pred = curData;
+									unpredictable_data[block_unpredictable_count ++] = curData;
+								}		
+							}
+							else{
+								type[index] = 0;
+								pred = curData;
+								unpredictable_data[block_unpredictable_count ++] = curData;
+							}
+							// assign value to next prediction buffer
+							next_pb_pos[jj] = pred;
+							index ++;	
+							cur_data_pos ++;
+						}
+						/*dealing with the last jj (boundary)*/
+						{
+							// jj == current_blockcount_y - 1
+							size_t jj = current_blockcount_y - 1;
+							curData = *cur_data_pos;
+							pred = last_coeffcients[0] * ii + last_coeffcients[1] * jj + last_coeffcients[2];
+							diff = curData - pred;
+							itvNum = fabs(diff)/realPrecision + 1;
+							if (itvNum < intvCapacity){
+								if (diff < 0) itvNum = -itvNum;
+								type[index] = (int) (itvNum/2) + intvRadius;
+								pred = pred + 2 * (type[index] - intvRadius) * realPrecision;
+								//guarantee the compression error bound against machine-epsilon rounding
+								if(fabs(curData - pred)>realPrecision){	
+									type[index] = 0;
+									pred = curData;
+									unpredictable_data[block_unpredictable_count ++] = curData;
+								}		
+							}
+							else{
+								type[index] = 0;
+								pred = curData;
+								unpredictable_data[block_unpredictable_count ++] = curData;
+							}
+
+							// assign value to block surfaces
+							pb_pos[ii * strip_dim0_offset + jj] = pred;
+							// assign value to next prediction buffer
+							next_pb_pos[jj] = pred;
+
+							index ++;	
+							cur_data_pos ++;
+						}
+					} // end ii == current_blockcount_x - 1
+					unpredictable_count = block_unpredictable_count;
+					total_unpred += unpredictable_count;
+					unpredictable_data += unpredictable_count;					
+					reg_count ++;
+				}// end use_reg
+				else{
+					// use SZ
+					// SZ prediction
+					unpredictable_count = 0;
+					double * cur_pb_pos = pb_pos;
+					double * cur_data_pos = data_pos;
+					double curData;
+					double pred2D;
+					double itvNum, diff;
+					size_t index = 0;
+					for(size_t ii=0; ii<current_blockcount_x - 1; ii++){
+						for(size_t jj=0; jj<current_blockcount_y; jj++){
+							curData = *cur_data_pos;
+
+							pred2D = cur_pb_pos[-1] + cur_pb_pos[-strip_dim0_offset] - cur_pb_pos[-strip_dim0_offset - 1];
+							diff = curData - pred2D;
+							itvNum = fabs(diff)/realPrecision + 1;
+							if (itvNum < intvCapacity_sz){
+								if (diff < 0) itvNum = -itvNum;
+								type[index] = (int) (itvNum/2) + intvRadius;
+								*cur_pb_pos = pred2D + 2 * (type[index] - intvRadius) * tmp_realPrecision;
+								//guarantee the compression error bound against machine-epsilon rounding
+								if(fabs(curData - *cur_pb_pos)>tmp_realPrecision){	
+									type[index] = 0;
+									*cur_pb_pos = curData;	
+									unpredictable_data[unpredictable_count ++] = curData;
+								}					
+							}
+							else{
+								type[index] = 0;
+								*cur_pb_pos = curData;
+								unpredictable_data[unpredictable_count ++] = curData;
+							}
+
+							index ++;
+							cur_pb_pos ++;
+							cur_data_pos ++;
+						}
+						cur_pb_pos += strip_dim0_offset - current_blockcount_y;
+						cur_data_pos += dim0_offset - current_blockcount_y;
+					}
+					/*dealing with the last ii (boundary)*/
+					{
+						// ii == current_blockcount_x - 1
+						for(size_t jj=0; jj<current_blockcount_y; jj++){
+							curData = *cur_data_pos;
+
+							pred2D = cur_pb_pos[-1] + cur_pb_pos[-strip_dim0_offset] - cur_pb_pos[-strip_dim0_offset - 1];
+							diff = curData - pred2D;
+							itvNum = fabs(diff)/realPrecision + 1;
+							if (itvNum < intvCapacity_sz){
+								if (diff < 0) itvNum = -itvNum;
+								type[index] = (int) (itvNum/2) + intvRadius;
+								*cur_pb_pos = pred2D + 2 * (type[index] - intvRadius) * tmp_realPrecision;
+								//guarantee the compression error bound against machine-epsilon rounding
+								if(fabs(curData - *cur_pb_pos)>tmp_realPrecision){	
+									type[index] = 0;
+									*cur_pb_pos = curData;	
+									unpredictable_data[unpredictable_count ++] = curData;
+								}					
+							}
+							else{
+								type[index] = 0;
+								*cur_pb_pos = curData;
+								unpredictable_data[unpredictable_count ++] = curData;
+							}
+							next_pb_pos[jj] = *cur_pb_pos;
+							index ++;
+							cur_pb_pos ++;
+							cur_data_pos ++;
+						}
+					}
+					total_unpred += unpredictable_count;
+					unpredictable_data += unpredictable_count;
+					// change indicator
+					indicator_pos[j] = 1;
+				}// end SZ
+				reg_params_pos ++;
+				data_pos += current_blockcount_y;
+				pb_pos += current_blockcount_y;
+				next_pb_pos += current_blockcount_y;
+				type += current_blockcount_x * current_blockcount_y;
+			}// end j
+			indicator_pos += num_y;
+			double * tmp;
+			tmp = cur_pb_buf;
+			cur_pb_buf = next_pb_buf;
+			next_pb_buf = tmp;
+		}// end i		
+	}
+	free(prediction_buffer_1);
+	free(prediction_buffer_2);
+
+	int stateNum = 2*quantization_intervals;
+	HuffmanTree* huffmanTree = createHuffmanTree(stateNum);
+
+	size_t nodeCount = 0;
+	size_t i = 0;
+	init(huffmanTree, result_type, num_elements);
+	for (i = 0; i < stateNum; i++)
+		if (huffmanTree->code[i]) nodeCount++; 
+	nodeCount = nodeCount*2-1;
+
+	unsigned char *treeBytes;
+	unsigned int treeByteSize = convert_HuffTree_to_bytes_anyStates(huffmanTree, nodeCount, &treeBytes);
+
+	unsigned int meta_data_offset = 3 + 1 + MetaDataByteLength;
+	// total size 										metadata		  # elements   real precision		intervals	nodeCount		huffman 	 	block index 						unpredicatable count						mean 					 	unpred size 				elements
+	unsigned char * result = (unsigned char *) calloc(meta_data_offset + exe_params->SZ_SIZE_TYPE + sizeof(double) + sizeof(int) + sizeof(int) + treeByteSize + num_blocks * sizeof(unsigned short) + num_blocks * sizeof(unsigned short) + num_blocks * sizeof(double) + total_unpred * sizeof(double) + num_elements * sizeof(int), 1);
+	unsigned char * result_pos = result;
+	initRandomAccessBytes(result_pos);
+	result_pos += meta_data_offset;
+
+	sizeToBytes(result_pos, num_elements);
+	result_pos += exe_params->SZ_SIZE_TYPE;
+	
+	intToBytes_bigEndian(result_pos, block_size);
+	result_pos += sizeof(int);
+	doubleToBytes(result_pos, realPrecision);
+	result_pos += sizeof(double);
+	intToBytes_bigEndian(result_pos, quantization_intervals);
+	result_pos += sizeof(int);
+	intToBytes_bigEndian(result_pos, treeByteSize);
+	result_pos += sizeof(int);
+	intToBytes_bigEndian(result_pos, nodeCount);
+	result_pos += sizeof(int);
+	memcpy(result_pos, treeBytes, treeByteSize);
+	result_pos += treeByteSize;
+	free(treeBytes);
+
+	memcpy(result_pos, &use_mean, sizeof(unsigned char));
+	result_pos += sizeof(unsigned char);
+	memcpy(result_pos, &mean, sizeof(double));
+	result_pos += sizeof(double);
+
+	size_t indicator_size = convertIntArray2ByteArray_fast_1b_to_result(indicator, num_blocks, result_pos);
+	result_pos += indicator_size;
+	
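+	//for each regression coefficient, write its precision, Huffman tree, encoded types and unpredictable values (only when some block used regression)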
+	if(reg_count>0){
+		for(int e=0; e<3; e++){
+			int stateNum = 2*coeff_intvCapacity_sz;
+			HuffmanTree* huffmanTree = createHuffmanTree(stateNum);
+			size_t nodeCount = 0;
+			init(huffmanTree, coeff_type[e], reg_count);
+			size_t i = 0;
+			for (i = 0; i < huffmanTree->stateNum; i++)
+				if (huffmanTree->code[i]) nodeCount++; 
+			nodeCount = nodeCount*2-1;
+			unsigned char *treeBytes;
+			unsigned int treeByteSize = convert_HuffTree_to_bytes_anyStates(huffmanTree, nodeCount, &treeBytes);
+			doubleToBytes(result_pos, precision[e]);
+			result_pos += sizeof(double);
+			intToBytes_bigEndian(result_pos, coeff_intvRadius);
+			result_pos += sizeof(int);
+			intToBytes_bigEndian(result_pos, treeByteSize);
+			result_pos += sizeof(int);
+			intToBytes_bigEndian(result_pos, nodeCount);
+			result_pos += sizeof(int);
+			memcpy(result_pos, treeBytes, treeByteSize);		
+			result_pos += treeByteSize;
+			free(treeBytes);
+			size_t typeArray_size = 0;
+			encode(huffmanTree, coeff_type[e], reg_count, result_pos + sizeof(size_t), &typeArray_size);
+			sizeToBytes(result_pos, typeArray_size);
+			result_pos += sizeof(size_t) + typeArray_size;
+			intToBytes_bigEndian(result_pos, coeff_unpredictable_count[e]);
+			result_pos += sizeof(int);
+			memcpy(result_pos, coeff_unpred_data[e], coeff_unpredictable_count[e]*sizeof(double));
+			result_pos += coeff_unpredictable_count[e]*sizeof(double);
+			SZ_ReleaseHuffman(huffmanTree);
+		}
+	}
+	free(coeff_result_type);
+	free(coeff_unpredictable_data);
+
+	//record the number of unpredictable data and also store them
+	memcpy(result_pos, &total_unpred, sizeof(size_t));
+	result_pos += sizeof(size_t);
+	memcpy(result_pos, result_unpredictable_data, total_unpred * sizeof(double));
+	result_pos += total_unpred * sizeof(double);
+	size_t typeArray_size = 0;
+	encode(huffmanTree, result_type, num_elements, result_pos, &typeArray_size);
+	result_pos += typeArray_size;
+
+	size_t totalEncodeSize = result_pos - result;
+	free(indicator);
+	free(result_unpredictable_data);
+	free(result_type);
+	free(reg_params);
+	
+	SZ_ReleaseHuffman(huffmanTree);
+	*comp_size = totalEncodeSize;
+
+	return result;
+}
+
+unsigned char * SZ_compress_double_3D_MDQ_nonblocked_with_blocked_regression(double *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, size_t * comp_size){
+
+	unsigned int quantization_intervals;
+	double sz_sample_correct_freq = -1;//0.5; //-1
+	double dense_pos;
+	double mean_flush_freq;
+	unsigned char use_mean = 0;
+
+	// calculate block dims
+	size_t num_x, num_y, num_z;
+	size_t block_size = 6;
+	SZ_COMPUTE_3D_NUMBER_OF_BLOCKS(r1, num_x, block_size);
+	SZ_COMPUTE_3D_NUMBER_OF_BLOCKS(r2, num_y, block_size);
+	SZ_COMPUTE_3D_NUMBER_OF_BLOCKS(r3, num_z, block_size);
+
+	size_t split_index_x, split_index_y, split_index_z;
+	size_t early_blockcount_x, early_blockcount_y, early_blockcount_z;
+	size_t late_blockcount_x, late_blockcount_y, late_blockcount_z;
+	SZ_COMPUTE_BLOCKCOUNT(r1, num_x, split_index_x, early_blockcount_x, late_blockcount_x);
+	SZ_COMPUTE_BLOCKCOUNT(r2, num_y, split_index_y, early_blockcount_y, late_blockcount_y);
+	SZ_COMPUTE_BLOCKCOUNT(r3, num_z, split_index_z, early_blockcount_z, late_blockcount_z);
+
+	size_t max_num_block_elements = early_blockcount_x * early_blockcount_y * early_blockcount_z;
+	size_t num_blocks = num_x * num_y * num_z;
+	size_t num_elements = r1 * r2 * r3;
+
+	size_t dim0_offset = r2 * r3;
+	size_t dim1_offset = r3;	
+
+	int * result_type = (int *) malloc(num_elements * sizeof(int));
+	size_t unpred_data_max_size = max_num_block_elements;
+	double * result_unpredictable_data = (double *) malloc(unpred_data_max_size * sizeof(double) * num_blocks);
+	size_t total_unpred = 0;
+	size_t unpredictable_count;
+	size_t max_unpred_count = 0;
+	double * data_pos = oriData;
+	int * type = result_type;
+	size_t type_offset;
+	size_t offset_x, offset_y, offset_z;
+	size_t current_blockcount_x, current_blockcount_y, current_blockcount_z;
+
+	double * reg_params = (double *) malloc(num_blocks * 4 * sizeof(double));
+	double * reg_params_pos = reg_params;
+	// move regression part out
+	size_t params_offset_b = num_blocks;
+	size_t params_offset_c = 2*num_blocks;
+	size_t params_offset_d = 3*num_blocks;
+	for(size_t i=0; i<num_x; i++){
+		for(size_t j=0; j<num_y; j++){
+			for(size_t k=0; k<num_z; k++){
+				current_blockcount_x = (i < split_index_x) ? early_blockcount_x : late_blockcount_x;
+				current_blockcount_y = (j < split_index_y) ? early_blockcount_y : late_blockcount_y;
+				current_blockcount_z = (k < split_index_z) ? early_blockcount_z : late_blockcount_z;
+				offset_x = (i < split_index_x) ? i * early_blockcount_x : i * late_blockcount_x + split_index_x;
+				offset_y = (j < split_index_y) ? j * early_blockcount_y : j * late_blockcount_y + split_index_y;
+				offset_z = (k < split_index_z) ? k * early_blockcount_z : k * late_blockcount_z + split_index_z;
+	
+				data_pos = oriData + offset_x * dim0_offset + offset_y * dim1_offset + offset_z;
+				/*Calculate regression coefficients*/
+				{
+					double * cur_data_pos = data_pos;
+					double fx = 0.0;
+					double fy = 0.0;
+					double fz = 0.0;
+					double f = 0;
+					double sum_x, sum_y; 
+					double curData;
+					for(size_t i=0; i<current_blockcount_x; i++){
+						sum_x = 0;
+						for(size_t j=0; j<current_blockcount_y; j++){
+							sum_y = 0;
+							for(size_t k=0; k<current_blockcount_z; k++){
+								curData = *cur_data_pos;
+								// f += curData;
+								// fx += curData * i;
+								// fy += curData * j;
+								// fz += curData * k;
+								sum_y += curData;
+								fz += curData * k;
+								cur_data_pos ++;
+							}
+							fy += sum_y * j;
+							sum_x += sum_y;
+							cur_data_pos += dim1_offset - current_blockcount_z;
+						}
+						fx += sum_x * i;
+						f += sum_x;
+						cur_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+					}
+					double coeff = 1.0 / (current_blockcount_x * current_blockcount_y * current_blockcount_z);
+					reg_params_pos[0] = (2 * fx / (current_blockcount_x - 1) - f) * 6 * coeff / (current_blockcount_x + 1);
+					reg_params_pos[params_offset_b] = (2 * fy / (current_blockcount_y - 1) - f) * 6 * coeff / (current_blockcount_y + 1);
+					reg_params_pos[params_offset_c] = (2 * fz / (current_blockcount_z - 1) - f) * 6 * coeff / (current_blockcount_z + 1);
+					reg_params_pos[params_offset_d] = f * coeff - ((current_blockcount_x - 1) * reg_params_pos[0] / 2 + (current_blockcount_y - 1) * reg_params_pos[params_offset_b] / 2 + (current_blockcount_z - 1) * reg_params_pos[params_offset_c] / 2);
+				}
+				reg_params_pos ++;
+			}
+		}
+	}
+	
+	//Compress coefficient arrays
+	double precision_a, precision_b, precision_c, precision_d;
+	double rel_param_err = 0.025;
+	precision_a = rel_param_err * realPrecision / late_blockcount_x;
+	precision_b = rel_param_err * realPrecision / late_blockcount_y;
+	precision_c = rel_param_err * realPrecision / late_blockcount_z;
+	precision_d = rel_param_err * realPrecision;
+
+	if(exe_params->optQuantMode==1)
+	{
+		quantization_intervals = optimize_intervals_double_3D_with_freq_and_dense_pos(oriData, r1, r2, r3, realPrecision, &dense_pos, &sz_sample_correct_freq, &mean_flush_freq);
+		if(mean_flush_freq > 0.5 || mean_flush_freq > sz_sample_correct_freq) use_mean = 1;
+		updateQuantizationInfo(quantization_intervals);
+	}	
+	else{
+		quantization_intervals = exe_params->intvCapacity;
+	}
+
+	double mean = 0;
+	if(use_mean){
+		// compute mean
+		double sum = 0.0;
+		size_t mean_count = 0;
+		for(size_t i=0; i<num_elements; i++){
+			if(fabs(oriData[i] - dense_pos) < realPrecision){
+				sum += oriData[i];
+				mean_count ++;
+			}
+		}
+		if(mean_count > 0) mean = sum / mean_count;
+	}
+
+	double tmp_realPrecision = realPrecision;
+
+	// use two prediction buffers for higher performance
+	double * unpredictable_data = result_unpredictable_data;
+	unsigned char * indicator = (unsigned char *) malloc(num_blocks * sizeof(unsigned char));
+	memset(indicator, 0, num_blocks * sizeof(unsigned char));
+	size_t reg_count = 0;
+	size_t strip_dim_0 = early_blockcount_x + 1;
+	size_t strip_dim_1 = r2 + 1;
+	size_t strip_dim_2 = r3 + 1;
+	size_t strip_dim0_offset = strip_dim_1 * strip_dim_2;
+	size_t strip_dim1_offset = strip_dim_2;
+	unsigned char * indicator_pos = indicator;
+
+	size_t prediction_buffer_size = strip_dim_0 * strip_dim0_offset * sizeof(double);
+	double * prediction_buffer_1 = (double *) malloc(prediction_buffer_size);
+	memset(prediction_buffer_1, 0, prediction_buffer_size);
+	double * prediction_buffer_2 = (double *) malloc(prediction_buffer_size);
+	memset(prediction_buffer_2, 0, prediction_buffer_size);
+	double * cur_pb_buf = prediction_buffer_1;
+	double * next_pb_buf = prediction_buffer_2;
+	double * cur_pb_buf_pos;
+	double * next_pb_buf_pos;
+	int intvCapacity = exe_params->intvCapacity;
+	int intvRadius = exe_params->intvRadius;	
+	int use_reg = 0;
+	double noise = realPrecision * 1.22;
+
+	reg_params_pos = reg_params;
+	// compress the regression coefficients on the fly
+	double last_coeffcients[4] = {0.0};
+	int coeff_intvCapacity_sz = 65536;
+	int coeff_intvRadius = coeff_intvCapacity_sz / 2;
+	int * coeff_type[4];
+	int * coeff_result_type = (int *) malloc(num_blocks*4*sizeof(int));
+	double * coeff_unpred_data[4];
+	double * coeff_unpredictable_data = (double *) malloc(num_blocks*4*sizeof(double));
+	double precision[4];
+	precision[0] = precision_a, precision[1] = precision_b, precision[2] = precision_c, precision[3] = precision_d;
+	for(int i=0; i<4; i++){
+		coeff_type[i] = coeff_result_type + i * num_blocks;
+		coeff_unpred_data[i] = coeff_unpredictable_data + i * num_blocks;
+	}
+	int coeff_index = 0;
+	unsigned int coeff_unpredictable_count[4] = {0};
+
+	if(use_mean){
+		int intvCapacity_sz = intvCapacity - 2;
+		for(size_t i=0; i<num_x; i++){
+			current_blockcount_x = (i < split_index_x) ? early_blockcount_x : late_blockcount_x;
+			offset_x = (i < split_index_x) ? i * early_blockcount_x : i * late_blockcount_x + split_index_x;
+			for(size_t j=0; j<num_y; j++){
+				offset_y = (j < split_index_y) ? j * early_blockcount_y : j * late_blockcount_y + split_index_y;
+				current_blockcount_y = (j < split_index_y) ? early_blockcount_y : late_blockcount_y;
+				data_pos = oriData + offset_x * dim0_offset + offset_y * dim1_offset;
+				type_offset = offset_x * dim0_offset +  offset_y * current_blockcount_x * dim1_offset;
+				type = result_type + type_offset;
+
+				// each prediction buffer holds strip_dim_0 * strip_dim_1 * strip_dim_2 = (early_blockcount_x + 1) * (r2 + 1) * (r3 + 1) doubles
+				cur_pb_buf_pos = cur_pb_buf + offset_y * strip_dim1_offset + strip_dim0_offset + strip_dim1_offset + 1;
+				next_pb_buf_pos = next_pb_buf + offset_y * strip_dim1_offset + strip_dim1_offset + 1;
+
+				size_t current_blockcount_z;
+				double * pb_pos = cur_pb_buf_pos;
+				double * next_pb_pos = next_pb_buf_pos;
+				size_t strip_unpredictable_count = 0;
+				for(size_t k=0; k<num_z; k++){
+					current_blockcount_z = (k < split_index_z) ? early_blockcount_z : late_blockcount_z;
+
+					/*sampling and decide which predictor*/
+					{
+						// sample point [1, 1, 1] [1, 1, 4] [1, 4, 1] [1, 4, 4] [4, 1, 1] [4, 1, 4] [4, 4, 1] [4, 4, 4]
+						double * cur_data_pos;
+						double curData;
+						double pred_reg, pred_sz;
+						double err_sz = 0.0, err_reg = 0.0;
+						int bmi = 0;
+						if(i>0 && j>0 && k>0){
+							for(int i=0; i<block_size; i++){
+								cur_data_pos = data_pos + i*dim0_offset + i*dim1_offset + i;
+								curData = *cur_data_pos;
+								pred_sz = cur_data_pos[-1] + cur_data_pos[-dim1_offset]+ cur_data_pos[-dim0_offset] - cur_data_pos[-dim1_offset - 1] - cur_data_pos[-dim0_offset - 1] - cur_data_pos[-dim0_offset - dim1_offset] + cur_data_pos[-dim0_offset - dim1_offset - 1];
+								pred_reg = reg_params_pos[0] * i + reg_params_pos[params_offset_b] * i + reg_params_pos[params_offset_c] * i + reg_params_pos[params_offset_d];							
+								err_sz += MIN(fabs(pred_sz - curData) + noise, fabs(mean - curData));
+								err_reg += fabs(pred_reg - curData);
+
+								bmi = block_size - i;
+								cur_data_pos = data_pos + i*dim0_offset + i*dim1_offset + bmi;
+								curData = *cur_data_pos;
+								pred_sz = cur_data_pos[-1] + cur_data_pos[-dim1_offset]+ cur_data_pos[-dim0_offset] - cur_data_pos[-dim1_offset - 1] - cur_data_pos[-dim0_offset - 1] - cur_data_pos[-dim0_offset - dim1_offset] + cur_data_pos[-dim0_offset - dim1_offset - 1];
+								pred_reg = reg_params_pos[0] * i + reg_params_pos[params_offset_b] * i + reg_params_pos[params_offset_c] * bmi + reg_params_pos[params_offset_d];							
+								err_sz += MIN(fabs(pred_sz - curData) + noise, fabs(mean - curData));
+								err_reg += fabs(pred_reg - curData);								
+
+								cur_data_pos = data_pos + i*dim0_offset + bmi*dim1_offset + i;
+								curData = *cur_data_pos;
+								pred_sz = cur_data_pos[-1] + cur_data_pos[-dim1_offset]+ cur_data_pos[-dim0_offset] - cur_data_pos[-dim1_offset - 1] - cur_data_pos[-dim0_offset - 1] - cur_data_pos[-dim0_offset - dim1_offset] + cur_data_pos[-dim0_offset - dim1_offset - 1];
+								pred_reg = reg_params_pos[0] * i + reg_params_pos[params_offset_b] * bmi + reg_params_pos[params_offset_c] * i + reg_params_pos[params_offset_d];							
+								err_sz += MIN(fabs(pred_sz - curData) + noise, fabs(mean - curData));
+								err_reg += fabs(pred_reg - curData);								
+
+								cur_data_pos = data_pos + i*dim0_offset + bmi*dim1_offset + bmi;
+								curData = *cur_data_pos;
+								pred_sz = cur_data_pos[-1] + cur_data_pos[-dim1_offset]+ cur_data_pos[-dim0_offset] - cur_data_pos[-dim1_offset - 1] - cur_data_pos[-dim0_offset - 1] - cur_data_pos[-dim0_offset - dim1_offset] + cur_data_pos[-dim0_offset - dim1_offset - 1];
+								pred_reg = reg_params_pos[0] * i + reg_params_pos[params_offset_b] * bmi + reg_params_pos[params_offset_c] * bmi + reg_params_pos[params_offset_d];							
+								err_sz += MIN(fabs(pred_sz - curData) + noise, fabs(mean - curData));
+								err_reg += fabs(pred_reg - curData);
+							}
+						}
+						else{
+							for(int i=1; i<block_size; i++){
+								cur_data_pos = data_pos + i*dim0_offset + i*dim1_offset + i;
+								curData = *cur_data_pos;
+								pred_sz = cur_data_pos[-1] + cur_data_pos[-dim1_offset]+ cur_data_pos[-dim0_offset] - cur_data_pos[-dim1_offset - 1] - cur_data_pos[-dim0_offset - 1] - cur_data_pos[-dim0_offset - dim1_offset] + cur_data_pos[-dim0_offset - dim1_offset - 1];
+								pred_reg = reg_params_pos[0] * i + reg_params_pos[params_offset_b] * i + reg_params_pos[params_offset_c] * i + reg_params_pos[params_offset_d];							
+								err_sz += MIN(fabs(pred_sz - curData) + noise, fabs(mean - curData));
+								err_reg += fabs(pred_reg - curData);
+
+								bmi = block_size - i;
+								cur_data_pos = data_pos + i*dim0_offset + i*dim1_offset + bmi;
+								curData = *cur_data_pos;
+								pred_sz = cur_data_pos[-1] + cur_data_pos[-dim1_offset]+ cur_data_pos[-dim0_offset] - cur_data_pos[-dim1_offset - 1] - cur_data_pos[-dim0_offset - 1] - cur_data_pos[-dim0_offset - dim1_offset] + cur_data_pos[-dim0_offset - dim1_offset - 1];
+								pred_reg = reg_params_pos[0] * i + reg_params_pos[params_offset_b] * i + reg_params_pos[params_offset_c] * bmi + reg_params_pos[params_offset_d];							
+								err_sz += MIN(fabs(pred_sz - curData) + noise, fabs(mean - curData));
+								err_reg += fabs(pred_reg - curData);								
+
+								cur_data_pos = data_pos + i*dim0_offset + bmi*dim1_offset + i;
+								curData = *cur_data_pos;
+								pred_sz = cur_data_pos[-1] + cur_data_pos[-dim1_offset]+ cur_data_pos[-dim0_offset] - cur_data_pos[-dim1_offset - 1] - cur_data_pos[-dim0_offset - 1] - cur_data_pos[-dim0_offset - dim1_offset] + cur_data_pos[-dim0_offset - dim1_offset - 1];
+								pred_reg = reg_params_pos[0] * i + reg_params_pos[params_offset_b] * bmi + reg_params_pos[params_offset_c] * i + reg_params_pos[params_offset_d];							
+								err_sz += MIN(fabs(pred_sz - curData) + noise, fabs(mean - curData));
+								err_reg += fabs(pred_reg - curData);								
+
+								cur_data_pos = data_pos + i*dim0_offset + bmi*dim1_offset + bmi;
+								curData = *cur_data_pos;
+								pred_sz = cur_data_pos[-1] + cur_data_pos[-dim1_offset]+ cur_data_pos[-dim0_offset] - cur_data_pos[-dim1_offset - 1] - cur_data_pos[-dim0_offset - 1] - cur_data_pos[-dim0_offset - dim1_offset] + cur_data_pos[-dim0_offset - dim1_offset - 1];
+								pred_reg = reg_params_pos[0] * i + reg_params_pos[params_offset_b] * bmi + reg_params_pos[params_offset_c] * bmi + reg_params_pos[params_offset_d];							
+								err_sz += MIN(fabs(pred_sz - curData) + noise, fabs(mean - curData));
+								err_reg += fabs(pred_reg - curData);								
+
+							}
+						}
+						use_reg = (err_reg < err_sz);
+					}
+					if(use_reg){
+						{
+							/*predict coefficients in current block via previous reg_block*/
+							double cur_coeff;
+							double diff, itvNum;
+							for(int e=0; e<4; e++){
+								cur_coeff = reg_params_pos[e*num_blocks];
+								diff = cur_coeff - last_coeffcients[e];
+								itvNum = fabs(diff)/precision[e] + 1;
+								if (itvNum < coeff_intvCapacity_sz){
+									if (diff < 0) itvNum = -itvNum;
+									coeff_type[e][coeff_index] = (int) (itvNum/2) + coeff_intvRadius;
+									last_coeffcients[e] = last_coeffcients[e] + 2 * (coeff_type[e][coeff_index] - coeff_intvRadius) * precision[e];
+									//guarantee the compression error bound against machine-epsilon rounding
+									if(fabs(cur_coeff - last_coeffcients[e])>precision[e]){	
+										coeff_type[e][coeff_index] = 0;
+										last_coeffcients[e] = cur_coeff;	
+										coeff_unpred_data[e][coeff_unpredictable_count[e] ++] = cur_coeff;
+									}					
+								}
+								else{
+									coeff_type[e][coeff_index] = 0;
+									last_coeffcients[e] = cur_coeff;
+									coeff_unpred_data[e][coeff_unpredictable_count[e] ++] = cur_coeff;
+								}
+							}
+							coeff_index ++;
+						}
+						double curData;
+						double pred;
+						double itvNum;
+						double diff;
+						size_t index = 0;
+						size_t block_unpredictable_count = 0;
+						double * cur_data_pos = data_pos;
+						for(size_t ii=0; ii<current_blockcount_x - 1; ii++){
+							for(size_t jj=0; jj<current_blockcount_y; jj++){
+								for(size_t kk=0; kk<current_blockcount_z; kk++){
+									curData = *cur_data_pos;
+									pred = last_coeffcients[0] * ii + last_coeffcients[1] * jj + last_coeffcients[2] * kk + last_coeffcients[3];									
+									diff = curData - pred;
+									itvNum = fabs(diff)/tmp_realPrecision + 1;
+									if (itvNum < intvCapacity){
+										if (diff < 0) itvNum = -itvNum;
+										type[index] = (int) (itvNum/2) + intvRadius;
+										pred = pred + 2 * (type[index] - intvRadius) * tmp_realPrecision;
+										//guarantee the compression error bound against machine-epsilon rounding
+										if(fabs(curData - pred)>tmp_realPrecision){	
+											type[index] = 0;
+											pred = curData;
+											unpredictable_data[block_unpredictable_count ++] = curData;
+										}		
+									}
+									else{
+										type[index] = 0;
+										pred = curData;
+										unpredictable_data[block_unpredictable_count ++] = curData;
+									}
+									if((jj == current_blockcount_y - 1) || (kk == current_blockcount_z - 1)){
+										// assign value to block surfaces
+										pb_pos[ii * strip_dim0_offset + jj * strip_dim1_offset + kk] = pred;
+									}
+									index ++;	
+									cur_data_pos ++;
+								}
+								cur_data_pos += dim1_offset - current_blockcount_z;
+							}
+							cur_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+						}
+						/*dealing with the last ii (boundary)*/
+						{
+							// ii == current_blockcount_x - 1
+							size_t ii = current_blockcount_x - 1;
+							for(size_t jj=0; jj<current_blockcount_y; jj++){
+								for(size_t kk=0; kk<current_blockcount_z; kk++){
+									curData = *cur_data_pos;
+									pred = last_coeffcients[0] * ii + last_coeffcients[1] * jj + last_coeffcients[2] * kk + last_coeffcients[3];									
+									diff = curData - pred;
+									itvNum = fabs(diff)/tmp_realPrecision + 1;
+									if (itvNum < intvCapacity){
+										if (diff < 0) itvNum = -itvNum;
+										type[index] = (int) (itvNum/2) + intvRadius;
+										pred = pred + 2 * (type[index] - intvRadius) * tmp_realPrecision;
+										//guarantee the compression error bound against machine-epsilon rounding
+										if(fabs(curData - pred)>tmp_realPrecision){	
+											type[index] = 0;
+											pred = curData;
+											unpredictable_data[block_unpredictable_count ++] = curData;
+										}		
+									}
+									else{
+										type[index] = 0;
+										pred = curData;
+										unpredictable_data[block_unpredictable_count ++] = curData;
+									}
+
+									if((jj == current_blockcount_y - 1) || (kk == current_blockcount_z - 1)){
+										// assign value to block surfaces
+										pb_pos[ii * strip_dim0_offset + jj * strip_dim1_offset + kk] = pred;
+									}
+									// assign value to next prediction buffer
+									next_pb_pos[jj * strip_dim1_offset + kk] = pred;
+									index ++;
+									cur_data_pos ++;
+								}
+								cur_data_pos += dim1_offset - current_blockcount_z;
+							}
+						}
+						unpredictable_count = block_unpredictable_count;
+						strip_unpredictable_count += unpredictable_count;
+						unpredictable_data += unpredictable_count;
+						
+						reg_count ++;
+					}
+					else{
+						// use SZ
+						// SZ prediction
+						unpredictable_count = 0;
+						double * cur_pb_pos = pb_pos;
+						double * cur_data_pos = data_pos;
+						double curData;
+						double pred3D;
+						double itvNum, diff;
+						size_t index = 0;
+						for(size_t ii=0; ii<current_blockcount_x - 1; ii++){
+							for(size_t jj=0; jj<current_blockcount_y; jj++){
+								for(size_t kk=0; kk<current_blockcount_z; kk++){
+
+									curData = *cur_data_pos;
+									if(fabs(curData - mean) <= realPrecision){
+										// adjust type[index] to intvRadius for coherence with freq in reg
+										type[index] = intvRadius;
+										*cur_pb_pos = mean;
+									}
+									else
+									{
+										pred3D = cur_pb_pos[-1] + cur_pb_pos[-strip_dim1_offset]+ cur_pb_pos[-strip_dim0_offset] - cur_pb_pos[-strip_dim1_offset - 1]
+												 - cur_pb_pos[-strip_dim0_offset - 1] - cur_pb_pos[-strip_dim0_offset - strip_dim1_offset] + cur_pb_pos[-strip_dim0_offset - strip_dim1_offset - 1];
+										diff = curData - pred3D;
+										itvNum = fabs(diff)/realPrecision + 1;
+										if (itvNum < intvCapacity_sz){
+											if (diff < 0) itvNum = -itvNum;
+											type[index] = (int) (itvNum/2) + intvRadius;
+											*cur_pb_pos = pred3D + 2 * (type[index] - intvRadius) * tmp_realPrecision;
+											if(type[index] <= intvRadius) type[index] -= 1;
+											//guarantee the compression error bound against machine-epsilon rounding
+											if(fabs(curData - *cur_pb_pos)>tmp_realPrecision){	
+												type[index] = 0;
+												*cur_pb_pos = curData;	
+												unpredictable_data[unpredictable_count ++] = curData;
+											}					
+										}
+										else{
+											type[index] = 0;
+											*cur_pb_pos = curData;
+											unpredictable_data[unpredictable_count ++] = curData;
+										}
+									}
+									index ++;
+									cur_pb_pos ++;
+									cur_data_pos ++;
+								}
+								cur_pb_pos += strip_dim1_offset - current_blockcount_z;
+								cur_data_pos += dim1_offset - current_blockcount_z;
+							}
+							cur_pb_pos += strip_dim0_offset - current_blockcount_y * strip_dim1_offset;
+							cur_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+						}
+						/*dealing with the last ii (boundary)*/
+						{
+							// ii == current_blockcount_x - 1
+							for(size_t jj=0; jj<current_blockcount_y; jj++){
+								for(size_t kk=0; kk<current_blockcount_z; kk++){
+
+									curData = *cur_data_pos;
+									if(fabs(curData - mean) <= realPrecision){
+										// adjust type[index] to intvRadius for coherence with freq in reg
+										type[index] = intvRadius;
+										*cur_pb_pos = mean;
+									}
+									else
+									{
+										pred3D = cur_pb_pos[-1] + cur_pb_pos[-strip_dim1_offset]+ cur_pb_pos[-strip_dim0_offset] - cur_pb_pos[-strip_dim1_offset - 1]
+												 - cur_pb_pos[-strip_dim0_offset - 1] - cur_pb_pos[-strip_dim0_offset - strip_dim1_offset] + cur_pb_pos[-strip_dim0_offset - strip_dim1_offset - 1];
+										diff = curData - pred3D;
+										itvNum = fabs(diff)/realPrecision + 1;
+										if (itvNum < intvCapacity_sz){
+											if (diff < 0) itvNum = -itvNum;
+											type[index] = (int) (itvNum/2) + intvRadius;
+											*cur_pb_pos = pred3D + 2 * (type[index] - intvRadius) * tmp_realPrecision;
+											if(type[index] <= intvRadius) type[index] -= 1;
+											//guarantee the compression error bound against machine-epsilon rounding
+											if(fabs(curData - *cur_pb_pos)>tmp_realPrecision){	
+												type[index] = 0;
+												*cur_pb_pos = curData;	
+												unpredictable_data[unpredictable_count ++] = curData;
+											}					
+										}
+										else{
+											type[index] = 0;
+											*cur_pb_pos = curData;
+											unpredictable_data[unpredictable_count ++] = curData;
+										}
+									}
+									next_pb_pos[jj * strip_dim1_offset + kk] = *cur_pb_pos;
+									index ++;
+									cur_pb_pos ++;
+									cur_data_pos ++;
+								}
+								cur_pb_pos += strip_dim1_offset - current_blockcount_z;
+								cur_data_pos += dim1_offset - current_blockcount_z;
+							}
+						}
+						strip_unpredictable_count += unpredictable_count;
+						unpredictable_data += unpredictable_count;
+						// change indicator
+						indicator_pos[k] = 1;
+					}// end SZ
+					
+					reg_params_pos ++;
+					data_pos += current_blockcount_z;
+					pb_pos += current_blockcount_z;
+					next_pb_pos += current_blockcount_z;
+					type += current_blockcount_x * current_blockcount_y * current_blockcount_z;
+
+				} // end k
+
+				if(strip_unpredictable_count > max_unpred_count){
+					max_unpred_count = strip_unpredictable_count;
+				}
+				total_unpred += strip_unpredictable_count;
+				indicator_pos += num_z;
+			}// end j
+			double * tmp;
+			tmp = cur_pb_buf;
+			cur_pb_buf = next_pb_buf;
+			next_pb_buf = tmp;
+		}// end i
+	}
+	else{
+		int intvCapacity_sz = intvCapacity - 2;
+		for(size_t i=0; i<num_x; i++){
+			current_blockcount_x = (i < split_index_x) ? early_blockcount_x : late_blockcount_x;
+			offset_x = (i < split_index_x) ? i * early_blockcount_x : i * late_blockcount_x + split_index_x;
+
+			for(size_t j=0; j<num_y; j++){
+				offset_y = (j < split_index_y) ? j * early_blockcount_y : j * late_blockcount_y + split_index_y;
+				current_blockcount_y = (j < split_index_y) ? early_blockcount_y : late_blockcount_y;
+				data_pos = oriData + offset_x * dim0_offset + offset_y * dim1_offset;
+				// copy bottom plane from plane buffer
+				// memcpy(prediction_buffer, bottom_buffer + offset_y * strip_dim1_offset, (current_blockcount_y + 1) * strip_dim1_offset * sizeof(double));
+				type_offset = offset_x * dim0_offset +  offset_y * current_blockcount_x * dim1_offset;
+				type = result_type + type_offset;
+
+				// prediction buffer is (current_block_count_x + 1) * (current_block_count_y + 1) * (current_block_count_z + 1)
+				cur_pb_buf_pos = cur_pb_buf + offset_y * strip_dim1_offset + strip_dim0_offset + strip_dim1_offset + 1;
+				next_pb_buf_pos = next_pb_buf + offset_y * strip_dim1_offset + strip_dim1_offset + 1;
+
+				size_t current_blockcount_z;
+				double * pb_pos = cur_pb_buf_pos;
+				double * next_pb_pos = next_pb_buf_pos;
+				size_t strip_unpredictable_count = 0;
+				for(size_t k=0; k<num_z; k++){
+					current_blockcount_z = (k < split_index_z) ? early_blockcount_z : late_blockcount_z;
+					/*sampling*/
+					{
+						// sample point [1, 1, 1] [1, 1, 4] [1, 4, 1] [1, 4, 4] [4, 1, 1] [4, 1, 4] [4, 4, 1] [4, 4, 4]
+						double * cur_data_pos;
+						double curData;
+						double pred_reg, pred_sz;
+						double err_sz = 0.0, err_reg = 0.0;
+						int bmi;
+						if(i>0 && j>0 && k>0){
+							for(int i=0; i<block_size; i++){
+								cur_data_pos = data_pos + i*dim0_offset + i*dim1_offset + i;
+								curData = *cur_data_pos;
+								pred_sz = cur_data_pos[-1] + cur_data_pos[-dim1_offset]+ cur_data_pos[-dim0_offset] - cur_data_pos[-dim1_offset - 1] - cur_data_pos[-dim0_offset - 1] - cur_data_pos[-dim0_offset - dim1_offset] + cur_data_pos[-dim0_offset - dim1_offset - 1];
+								pred_reg = reg_params_pos[0] * i + reg_params_pos[params_offset_b] * i + reg_params_pos[params_offset_c] * i + reg_params_pos[params_offset_d];							
+								err_sz += fabs(pred_sz - curData) + noise;
+								err_reg += fabs(pred_reg - curData);
+
+								bmi = block_size - i;
+								cur_data_pos = data_pos + i*dim0_offset + i*dim1_offset + bmi;
+								curData = *cur_data_pos;
+								pred_sz = cur_data_pos[-1] + cur_data_pos[-dim1_offset]+ cur_data_pos[-dim0_offset] - cur_data_pos[-dim1_offset - 1] - cur_data_pos[-dim0_offset - 1] - cur_data_pos[-dim0_offset - dim1_offset] + cur_data_pos[-dim0_offset - dim1_offset - 1];
+								pred_reg = reg_params_pos[0] * i + reg_params_pos[params_offset_b] * i + reg_params_pos[params_offset_c] * bmi + reg_params_pos[params_offset_d];							
+								err_sz += fabs(pred_sz - curData) + noise;
+								err_reg += fabs(pred_reg - curData);								
+
+								cur_data_pos = data_pos + i*dim0_offset + bmi*dim1_offset + i;
+								curData = *cur_data_pos;
+								pred_sz = cur_data_pos[-1] + cur_data_pos[-dim1_offset]+ cur_data_pos[-dim0_offset] - cur_data_pos[-dim1_offset - 1] - cur_data_pos[-dim0_offset - 1] - cur_data_pos[-dim0_offset - dim1_offset] + cur_data_pos[-dim0_offset - dim1_offset - 1];
+								pred_reg = reg_params_pos[0] * i + reg_params_pos[params_offset_b] * bmi + reg_params_pos[params_offset_c] * i + reg_params_pos[params_offset_d];							
+								err_sz += fabs(pred_sz - curData) + noise;
+								err_reg += fabs(pred_reg - curData);								
+
+								cur_data_pos = data_pos + i*dim0_offset + bmi*dim1_offset + bmi;
+								curData = *cur_data_pos;
+								pred_sz = cur_data_pos[-1] + cur_data_pos[-dim1_offset]+ cur_data_pos[-dim0_offset] - cur_data_pos[-dim1_offset - 1] - cur_data_pos[-dim0_offset - 1] - cur_data_pos[-dim0_offset - dim1_offset] + cur_data_pos[-dim0_offset - dim1_offset - 1];
+								pred_reg = reg_params_pos[0] * i + reg_params_pos[params_offset_b] * bmi + reg_params_pos[params_offset_c] * bmi + reg_params_pos[params_offset_d];							
+								err_sz += fabs(pred_sz - curData) + noise;
+								err_reg += fabs(pred_reg - curData);
+							}
+						}
+						else{
+							for(int i=1; i<block_size; i++){
+								cur_data_pos = data_pos + i*dim0_offset + i*dim1_offset + i;
+								curData = *cur_data_pos;
+								pred_sz = cur_data_pos[-1] + cur_data_pos[-dim1_offset]+ cur_data_pos[-dim0_offset] - cur_data_pos[-dim1_offset - 1] - cur_data_pos[-dim0_offset - 1] - cur_data_pos[-dim0_offset - dim1_offset] + cur_data_pos[-dim0_offset - dim1_offset - 1];
+								pred_reg = reg_params_pos[0] * i + reg_params_pos[params_offset_b] * i + reg_params_pos[params_offset_c] * i + reg_params_pos[params_offset_d];							
+								err_sz += fabs(pred_sz - curData) + noise;
+								err_reg += fabs(pred_reg - curData);
+
+								bmi = block_size - i;
+								cur_data_pos = data_pos + i*dim0_offset + i*dim1_offset + bmi;
+								curData = *cur_data_pos;
+								pred_sz = cur_data_pos[-1] + cur_data_pos[-dim1_offset]+ cur_data_pos[-dim0_offset] - cur_data_pos[-dim1_offset - 1] - cur_data_pos[-dim0_offset - 1] - cur_data_pos[-dim0_offset - dim1_offset] + cur_data_pos[-dim0_offset - dim1_offset - 1];
+								pred_reg = reg_params_pos[0] * i + reg_params_pos[params_offset_b] * i + reg_params_pos[params_offset_c] * bmi + reg_params_pos[params_offset_d];							
+								err_sz += fabs(pred_sz - curData) + noise;
+								err_reg += fabs(pred_reg - curData);								
+
+								cur_data_pos = data_pos + i*dim0_offset + bmi*dim1_offset + i;
+								curData = *cur_data_pos;
+								pred_sz = cur_data_pos[-1] + cur_data_pos[-dim1_offset]+ cur_data_pos[-dim0_offset] - cur_data_pos[-dim1_offset - 1] - cur_data_pos[-dim0_offset - 1] - cur_data_pos[-dim0_offset - dim1_offset] + cur_data_pos[-dim0_offset - dim1_offset - 1];
+								pred_reg = reg_params_pos[0] * i + reg_params_pos[params_offset_b] * bmi + reg_params_pos[params_offset_c] * i + reg_params_pos[params_offset_d];							
+								err_sz += fabs(pred_sz - curData) + noise;
+								err_reg += fabs(pred_reg - curData);								
+
+								cur_data_pos = data_pos + i*dim0_offset + bmi*dim1_offset + bmi;
+								curData = *cur_data_pos;
+								pred_sz = cur_data_pos[-1] + cur_data_pos[-dim1_offset]+ cur_data_pos[-dim0_offset] - cur_data_pos[-dim1_offset - 1] - cur_data_pos[-dim0_offset - 1] - cur_data_pos[-dim0_offset - dim1_offset] + cur_data_pos[-dim0_offset - dim1_offset - 1];
+								pred_reg = reg_params_pos[0] * i + reg_params_pos[params_offset_b] * bmi + reg_params_pos[params_offset_c] * bmi + reg_params_pos[params_offset_d];							
+								err_sz += fabs(pred_sz - curData) + noise;
+								err_reg += fabs(pred_reg - curData);
+							}
+						}
+						use_reg = (err_reg < err_sz);
+
+					}
+					if(use_reg)
+					{
+						{
+							/*predict coefficients in current block via previous reg_block*/
+							double cur_coeff;
+							double diff, itvNum;
+							for(int e=0; e<4; e++){
+								cur_coeff = reg_params_pos[e*num_blocks];
+								diff = cur_coeff - last_coeffcients[e];
+								itvNum = fabs(diff)/precision[e] + 1;
+								if (itvNum < coeff_intvCapacity_sz){
+									if (diff < 0) itvNum = -itvNum;
+									coeff_type[e][coeff_index] = (int) (itvNum/2) + coeff_intvRadius;
+									last_coeffcients[e] = last_coeffcients[e] + 2 * (coeff_type[e][coeff_index] - coeff_intvRadius) * precision[e];
+									//ganrantee comporession error against the case of machine-epsilon
+									if(fabs(cur_coeff - last_coeffcients[e])>precision[e]){	
+										coeff_type[e][coeff_index] = 0;
+										last_coeffcients[e] = cur_coeff;	
+										coeff_unpred_data[e][coeff_unpredictable_count[e] ++] = cur_coeff;
+									}					
+								}
+								else{
+									coeff_type[e][coeff_index] = 0;
+									last_coeffcients[e] = cur_coeff;
+									coeff_unpred_data[e][coeff_unpredictable_count[e] ++] = cur_coeff;
+								}
+							}
+							coeff_index ++;
+						}
+						double curData;
+						double pred;
+						double itvNum;
+						double diff;
+						size_t index = 0;
+						size_t block_unpredictable_count = 0;
+						double * cur_data_pos = data_pos;
+						for(size_t ii=0; ii<current_blockcount_x - 1; ii++){
+							for(size_t jj=0; jj<current_blockcount_y; jj++){
+								for(size_t kk=0; kk<current_blockcount_z; kk++){
+
+									curData = *cur_data_pos;
+									pred = last_coeffcients[0] * ii + last_coeffcients[1] * jj + last_coeffcients[2] * kk + last_coeffcients[3];									
+									diff = curData - pred;
+									itvNum = fabs(diff)/tmp_realPrecision + 1;
+									if (itvNum < intvCapacity){
+										if (diff < 0) itvNum = -itvNum;
+										type[index] = (int) (itvNum/2) + intvRadius;
+										pred = pred + 2 * (type[index] - intvRadius) * tmp_realPrecision;
+										//ganrantee comporession error against the case of machine-epsilon
+										if(fabs(curData - pred)>tmp_realPrecision){	
+											type[index] = 0;
+											pred = curData;
+											unpredictable_data[block_unpredictable_count ++] = curData;
+										}		
+									}
+									else{
+										type[index] = 0;
+										pred = curData;
+										unpredictable_data[block_unpredictable_count ++] = curData;
+									}
+
+									if((jj == current_blockcount_y - 1) || (kk == current_blockcount_z - 1)){
+										// assign value to block surfaces
+										pb_pos[ii * strip_dim0_offset + jj * strip_dim1_offset + kk] = pred;
+									}
+									index ++;	
+									cur_data_pos ++;
+								}
+								cur_data_pos += dim1_offset - current_blockcount_z;
+							}
+							cur_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+						}
+						/*dealing with the last ii (boundary)*/
+						{
+							// ii == current_blockcount_x - 1
+							size_t ii = current_blockcount_x - 1;
+							for(size_t jj=0; jj<current_blockcount_y; jj++){
+								for(size_t kk=0; kk<current_blockcount_z; kk++){
+									curData = *cur_data_pos;
+									pred = last_coeffcients[0] * ii + last_coeffcients[1] * jj + last_coeffcients[2] * kk + last_coeffcients[3];									
+									diff = curData - pred;
+									itvNum = fabs(diff)/tmp_realPrecision + 1;
+									if (itvNum < intvCapacity){
+										if (diff < 0) itvNum = -itvNum;
+										type[index] = (int) (itvNum/2) + intvRadius;
+										pred = pred + 2 * (type[index] - intvRadius) * tmp_realPrecision;
+										//ganrantee comporession error against the case of machine-epsilon
+										if(fabs(curData - pred)>tmp_realPrecision){	
+											type[index] = 0;
+											pred = curData;
+											unpredictable_data[block_unpredictable_count ++] = curData;
+										}		
+									}
+									else{
+										type[index] = 0;
+										pred = curData;
+										unpredictable_data[block_unpredictable_count ++] = curData;
+									}
+
+									if((jj == current_blockcount_y - 1) || (kk == current_blockcount_z - 1)){
+										// assign value to block surfaces
+										pb_pos[ii * strip_dim0_offset + jj * strip_dim1_offset + kk] = pred;
+									}
+									// assign value to next prediction buffer
+									next_pb_pos[jj * strip_dim1_offset + kk] = pred;
+									index ++;
+									cur_data_pos ++;
+								}
+								cur_data_pos += dim1_offset - current_blockcount_z;
+							}
+						}
+						unpredictable_count = block_unpredictable_count;
+						strip_unpredictable_count += unpredictable_count;
+						unpredictable_data += unpredictable_count;						
+						reg_count ++;
+					}
+					else{
+						// use SZ
+						// SZ predication
+						unpredictable_count = 0;
+						double * cur_pb_pos = pb_pos;
+						double * cur_data_pos = data_pos;
+						double curData;
+						double pred3D;
+						double itvNum, diff;
+						size_t index = 0;
+						for(size_t ii=0; ii<current_blockcount_x - 1; ii++){
+							for(size_t jj=0; jj<current_blockcount_y; jj++){
+								for(size_t kk=0; kk<current_blockcount_z; kk++){
+
+									curData = *cur_data_pos;
+									pred3D = cur_pb_pos[-1] + cur_pb_pos[-strip_dim1_offset]+ cur_pb_pos[-strip_dim0_offset] - cur_pb_pos[-strip_dim1_offset - 1]
+											 - cur_pb_pos[-strip_dim0_offset - 1] - cur_pb_pos[-strip_dim0_offset - strip_dim1_offset] + cur_pb_pos[-strip_dim0_offset - strip_dim1_offset - 1];
+									diff = curData - pred3D;
+									itvNum = fabs(diff)/realPrecision + 1;
+									if (itvNum < intvCapacity_sz){
+										if (diff < 0) itvNum = -itvNum;
+										type[index] = (int) (itvNum/2) + intvRadius;
+										*cur_pb_pos = pred3D + 2 * (type[index] - intvRadius) * tmp_realPrecision;
+										//ganrantee comporession error against the case of machine-epsilon
+										if(fabs(curData - *cur_pb_pos)>tmp_realPrecision){	
+											type[index] = 0;
+											*cur_pb_pos = curData;	
+											unpredictable_data[unpredictable_count ++] = curData;
+										}					
+									}
+									else{
+										type[index] = 0;
+										*cur_pb_pos = curData;
+										unpredictable_data[unpredictable_count ++] = curData;
+									}
+									index ++;
+									cur_pb_pos ++;
+									cur_data_pos ++;
+								}
+								cur_pb_pos += strip_dim1_offset - current_blockcount_z;
+								cur_data_pos += dim1_offset - current_blockcount_z;
+							}
+							cur_pb_pos += strip_dim0_offset - current_blockcount_y * strip_dim1_offset;
+							cur_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+						}
+						/*dealing with the last ii (boundary)*/
+						{
+							// ii == current_blockcount_x - 1
+							for(size_t jj=0; jj<current_blockcount_y; jj++){
+								for(size_t kk=0; kk<current_blockcount_z; kk++){
+
+									curData = *cur_data_pos;
+									pred3D = cur_pb_pos[-1] + cur_pb_pos[-strip_dim1_offset]+ cur_pb_pos[-strip_dim0_offset] - cur_pb_pos[-strip_dim1_offset - 1]
+											 - cur_pb_pos[-strip_dim0_offset - 1] - cur_pb_pos[-strip_dim0_offset - strip_dim1_offset] + cur_pb_pos[-strip_dim0_offset - strip_dim1_offset - 1];
+									diff = curData - pred3D;
+									itvNum = fabs(diff)/realPrecision + 1;
+									if (itvNum < intvCapacity_sz){
+										if (diff < 0) itvNum = -itvNum;
+										type[index] = (int) (itvNum/2) + intvRadius;
+										*cur_pb_pos = pred3D + 2 * (type[index] - intvRadius) * tmp_realPrecision;
+										//ganrantee comporession error against the case of machine-epsilon
+										if(fabs(curData - *cur_pb_pos)>tmp_realPrecision){	
+											type[index] = 0;
+											*cur_pb_pos = curData;	
+											unpredictable_data[unpredictable_count ++] = curData;
+										}					
+									}
+									else{
+										type[index] = 0;
+										*cur_pb_pos = curData;
+										unpredictable_data[unpredictable_count ++] = curData;
+									}
+									// assign value to next prediction buffer
+									next_pb_pos[jj * strip_dim1_offset + kk] = *cur_pb_pos;
+									index ++;
+									cur_pb_pos ++;
+									cur_data_pos ++;
+								}
+								cur_pb_pos += strip_dim1_offset - current_blockcount_z;
+								cur_data_pos += dim1_offset - current_blockcount_z;
+							}
+						}
+						strip_unpredictable_count += unpredictable_count;
+						unpredictable_data += unpredictable_count;
+						// change indicator
+						indicator_pos[k] = 1;
+					}// end SZ
+					
+					reg_params_pos ++;
+					data_pos += current_blockcount_z;
+					pb_pos += current_blockcount_z;
+					next_pb_pos += current_blockcount_z;
+					type += current_blockcount_x * current_blockcount_y * current_blockcount_z;
+
+				}
+
+				if(strip_unpredictable_count > max_unpred_count){
+					max_unpred_count = strip_unpredictable_count;
+				}
+				total_unpred += strip_unpredictable_count;
+				indicator_pos += num_z;
+			}
+			double * tmp;
+			tmp = cur_pb_buf;
+			cur_pb_buf = next_pb_buf;
+			next_pb_buf = tmp;
+		}
+	}
+
+	free(prediction_buffer_1);
+	free(prediction_buffer_2);
+
+	int stateNum = 2*quantization_intervals;
+	HuffmanTree* huffmanTree = createHuffmanTree(stateNum);
+
+	size_t nodeCount = 0;
+	init(huffmanTree, result_type, num_elements);
+	size_t i = 0;
+	for (i = 0; i < huffmanTree->stateNum; i++)
+		if (huffmanTree->code[i]) nodeCount++; 
+	nodeCount = nodeCount*2-1;
+
+	unsigned char *treeBytes;
+	unsigned int treeByteSize = convert_HuffTree_to_bytes_anyStates(huffmanTree, nodeCount, &treeBytes);
+
+	unsigned int meta_data_offset = 3 + 1 + MetaDataByteLength;
+	// total size 										metadata		  # elements     real precision		intervals	nodeCount		huffman 	 	block index 						unpredicatable count						mean 					 	unpred size 				elements
+	unsigned char * result = (unsigned char *) calloc(meta_data_offset + exe_params->SZ_SIZE_TYPE + sizeof(double) + sizeof(int) + sizeof(int) + treeByteSize + num_blocks * sizeof(unsigned short) + num_blocks * sizeof(unsigned short) + num_blocks * sizeof(double) + total_unpred * sizeof(double) + num_elements * sizeof(int), 1);
+	unsigned char * result_pos = result;
+	initRandomAccessBytes(result_pos);
+	
+	result_pos += meta_data_offset;
+	
+	sizeToBytes(result_pos,num_elements); //SZ_SIZE_TYPE: 4 or 8
+	result_pos += exe_params->SZ_SIZE_TYPE;
+
+	intToBytes_bigEndian(result_pos, block_size);
+	result_pos += sizeof(int);
+	doubleToBytes(result_pos, realPrecision);
+	result_pos += sizeof(double);
+	intToBytes_bigEndian(result_pos, quantization_intervals);
+	result_pos += sizeof(int);
+	intToBytes_bigEndian(result_pos, treeByteSize);
+	result_pos += sizeof(int);
+	intToBytes_bigEndian(result_pos, nodeCount);
+	result_pos += sizeof(int);
+	memcpy(result_pos, treeBytes, treeByteSize);
+	result_pos += treeByteSize;
+	free(treeBytes);
+
+	memcpy(result_pos, &use_mean, sizeof(unsigned char));
+	result_pos += sizeof(unsigned char);
+	memcpy(result_pos, &mean, sizeof(double));
+	result_pos += sizeof(double);
+	size_t indicator_size = convertIntArray2ByteArray_fast_1b_to_result(indicator, num_blocks, result_pos);
+	result_pos += indicator_size;
+	
+	//convert the lead/mid/resi to byte stream
+	if(reg_count > 0){
+		for(int e=0; e<4; e++){
+			int stateNum = 2*coeff_intvCapacity_sz;
+			HuffmanTree* huffmanTree = createHuffmanTree(stateNum);
+			size_t nodeCount = 0;
+			init(huffmanTree, coeff_type[e], reg_count);
+			size_t i = 0;
+			for (i = 0; i < huffmanTree->stateNum; i++)
+				if (huffmanTree->code[i]) nodeCount++; 
+			nodeCount = nodeCount*2-1;
+			unsigned char *treeBytes;
+			unsigned int treeByteSize = convert_HuffTree_to_bytes_anyStates(huffmanTree, nodeCount, &treeBytes);
+			doubleToBytes(result_pos, precision[e]);
+			result_pos += sizeof(double);
+			intToBytes_bigEndian(result_pos, coeff_intvRadius);
+			result_pos += sizeof(int);
+			intToBytes_bigEndian(result_pos, treeByteSize);
+			result_pos += sizeof(int);
+			intToBytes_bigEndian(result_pos, nodeCount);
+			result_pos += sizeof(int);
+			memcpy(result_pos, treeBytes, treeByteSize);		
+			result_pos += treeByteSize;
+			free(treeBytes);
+			size_t typeArray_size = 0;
+			encode(huffmanTree, coeff_type[e], reg_count, result_pos + sizeof(size_t), &typeArray_size);
+			sizeToBytes(result_pos, typeArray_size);
+			result_pos += sizeof(size_t) + typeArray_size;
+			intToBytes_bigEndian(result_pos, coeff_unpredictable_count[e]);
+			result_pos += sizeof(int);
+			memcpy(result_pos, coeff_unpred_data[e], coeff_unpredictable_count[e]*sizeof(double));
+			result_pos += coeff_unpredictable_count[e]*sizeof(double);
+			SZ_ReleaseHuffman(huffmanTree);
+		}
+	}
+	free(coeff_result_type);
+	free(coeff_unpredictable_data);
+	
+	//record the number of unpredictable data and also store them
+	memcpy(result_pos, &total_unpred, sizeof(size_t));
+	result_pos += sizeof(size_t);
+	memcpy(result_pos, result_unpredictable_data, total_unpred * sizeof(double));
+	result_pos += total_unpred * sizeof(double);
+	size_t typeArray_size = 0;
+	encode(huffmanTree, result_type, num_elements, result_pos, &typeArray_size);
+	result_pos += typeArray_size;
+	size_t totalEncodeSize = result_pos - result;
+	free(indicator);
+	free(result_unpredictable_data);
+	free(result_type);
+	free(reg_params);
+
+	
+	SZ_ReleaseHuffman(huffmanTree);
+	*comp_size = totalEncodeSize;
+	return result;
+}
diff --git a/thirdparty/SZ/sz/src/sz_double_pwr.c b/thirdparty/SZ/sz/src/sz_double_pwr.c
index 59be38cc48fa85bfc9ce147c2307beeb145fcc27..881d4952281dc2a5ad0a9bf10fbecb846d270394 100644
--- a/thirdparty/SZ/sz/src/sz_double_pwr.c
+++ b/thirdparty/SZ/sz/src/sz_double_pwr.c
@@ -23,6 +23,7 @@
 #include "sz_double_pwr.h"
 #include "zlib.h"
 #include "rw.h"
+#include "utility.h"
 
 void compute_segment_precisions_double_1D(double *oriData, size_t dataLength, double* pwrErrBound, unsigned char* pwrErrBoundBytes, double globalPrecision)
 {
@@ -1772,3 +1773,188 @@ size_t dataLength, double absErrBound, double relBoundRatio, double pwrErrRatio,
 
         free_TightDataPointStorageD(tdps);
 }
+
+#include <stdbool.h>
+
+void SZ_compress_args_double_NoCkRngeNoGzip_1D_pwr_pre_log(unsigned char** newByteData, double *oriData, double pwrErrRatio, size_t dataLength, size_t *outSize, double min, double max){
+	// Compresses dataLength doubles from oriData for the relative ratio pwrErrRatio: signs are split off, log2(|x|) is handed to SZ_compress_double_1D_MDQ, and the serialized bytes/size are produced through newByteData/outSize.
+	double * log_data = (double *) malloc(dataLength * sizeof(double));
+	// signs[i] becomes 1 for every negative input and stays 0 otherwise
+	unsigned char * signs = (unsigned char *) malloc(dataLength);
+	memset(signs, 0, dataLength);
+	// preprocess: seed max_abs_log_data from the caller-supplied min/max (when min is 0 only max is used, when max is 0 only min is used)
+	double max_abs_log_data;
+    if(min == 0) max_abs_log_data = fabs(log2(fabs(max)));
+    else if(max == 0) max_abs_log_data = fabs(log2(fabs(min)));
+    else max_abs_log_data = fabs(log2(fabs(min))) > fabs(log2(fabs(max))) ? fabs(log2(fabs(min))) : fabs(log2(fabs(max)));
+    double min_log_data = max_abs_log_data;
+	bool positive = true; // cleared as soon as one negative value is seen
+	for(size_t i=0; i<dataLength; i++){
+		if(oriData[i] < 0){
+			signs[i] = 1;
+			log_data[i] = -oriData[i];
+			positive = false;
+		}
+		else
+			log_data[i] = oriData[i];
+		if(log_data[i] > 0){ // zeros are left untouched here and replaced by a sentinel further down
+			log_data[i] = log2(log_data[i]);
+			if(log_data[i] > max_abs_log_data) max_abs_log_data = log_data[i];
+			if(log_data[i] < min_log_data) min_log_data = log_data[i];
+		}
+	}
+	// realPrecision is the bound used on the log2 values: log2(1 + pwrErrRatio) reduced by max_abs_log_data * 2.23e-16 (presumably a floating-point rounding margin -- TODO confirm)
+	double valueRangeSize, medianValue_f;
+	computeRangeSize_double(log_data, dataLength, &valueRangeSize, &medianValue_f);	
+	if(fabs(min_log_data) > max_abs_log_data) max_abs_log_data = fabs(min_log_data);
+	double realPrecision = log2(1.0 + pwrErrRatio) - max_abs_log_data * 2.23e-16;
+	for(size_t i=0; i<dataLength; i++){
+		if(oriData[i] == 0){
+			log_data[i] = min_log_data - 2.0001*realPrecision; // zero has no log2: park it 2.0001*realPrecision below the smallest log value
+		}
+	}
+    TightDataPointStorageD* tdps = SZ_compress_double_1D_MDQ(log_data, dataLength, realPrecision, valueRangeSize, medianValue_f);
+    tdps->minLogValue = min_log_data - 1.0001*realPrecision; // lies between the zero sentinel and the smallest real log value; presumably used on decompression to recognise zeros -- TODO confirm
+    free(log_data);
+    if(!positive){
+	    unsigned char * comp_signs;
+		// compress signs; the pwrErrBoundBytes fields of tdps are used to carry the compressed sign array
+		unsigned long signSize = sz_lossless_compress(confparams_cpr->losslessCompressor, confparams_cpr->gzipMode, signs, dataLength, &comp_signs);
+		tdps->pwrErrBoundBytes = comp_signs;
+		tdps->pwrErrBoundBytes_size = signSize;
+	}
+	else{
+		tdps->pwrErrBoundBytes = NULL;
+		tdps->pwrErrBoundBytes_size = 0;
+	}
+	free(signs);
+	// Serialize; if the result is larger than the raw doubles, store oriData instead. The count passed is dataLength (it used to be dataLength+2, two more elements than this function ever treats oriData as holding).
+    convertTDPStoFlatBytes_double(tdps, newByteData, outSize);
+    if(*outSize>dataLength*sizeof(double))
+            SZ_compress_args_double_StoreOriData(oriData, dataLength, tdps, newByteData, outSize);
+
+    free_TightDataPointStorageD(tdps);
+}
+
+void SZ_compress_args_double_NoCkRngeNoGzip_2D_pwr_pre_log(unsigned char** newByteData, double *oriData, double pwrErrRatio, size_t r1, size_t r2, size_t *outSize, double min, double max){
+	// Compresses the r1*r2 doubles in oriData for the relative ratio pwrErrRatio: signs are split off, log2(|x|) is handed to SZ_compress_double_2D_MDQ, and the serialized bytes/size are produced through newByteData/outSize.
+	size_t dataLength = r1 * r2;
+	double * log_data = (double *) malloc(dataLength * sizeof(double));
+	// signs[i] becomes 1 for every negative input and stays 0 otherwise
+	unsigned char * signs = (unsigned char *) malloc(dataLength);
+	memset(signs, 0, dataLength);
+	// preprocess: seed max_abs_log_data from the caller-supplied min/max (when min is 0 only max is used, when max is 0 only min is used)
+	double max_abs_log_data;
+    if(min == 0) max_abs_log_data = fabs(log2(fabs(max)));
+    else if(max == 0) max_abs_log_data = fabs(log2(fabs(min)));
+    else max_abs_log_data = fabs(log2(fabs(min))) > fabs(log2(fabs(max))) ? fabs(log2(fabs(min))) : fabs(log2(fabs(max)));
+    double min_log_data = max_abs_log_data;
+	bool positive = true; // cleared as soon as one negative value is seen
+	for(size_t i=0; i<dataLength; i++){
+		if(oriData[i] < 0){
+			signs[i] = 1;
+			log_data[i] = -oriData[i];
+			positive = false;
+		}
+		else
+			log_data[i] = oriData[i];
+		if(log_data[i] > 0){ // zeros are left untouched here and replaced by a sentinel further down
+			log_data[i] = log2(log_data[i]);
+			if(log_data[i] > max_abs_log_data) max_abs_log_data = log_data[i];
+			if(log_data[i] < min_log_data) min_log_data = log_data[i];
+		}
+	}
+	// realPrecision is the bound used on the log2 values: log2(1 + pwrErrRatio) reduced by max_abs_log_data * 2.23e-16 (presumably a floating-point rounding margin -- TODO confirm)
+	double valueRangeSize, medianValue_f;
+	computeRangeSize_double(log_data, dataLength, &valueRangeSize, &medianValue_f);	
+	if(fabs(min_log_data) > max_abs_log_data) max_abs_log_data = fabs(min_log_data);
+	double realPrecision = log2(1.0 + pwrErrRatio) - max_abs_log_data * 2.23e-16;
+	for(size_t i=0; i<dataLength; i++){
+		if(oriData[i] == 0){
+			log_data[i] = min_log_data - 2.0001*realPrecision; // zero has no log2: park it 2.0001*realPrecision below the smallest log value
+		}
+	}
+    TightDataPointStorageD* tdps = SZ_compress_double_2D_MDQ(log_data, r1, r2, realPrecision, valueRangeSize, medianValue_f);
+    tdps->minLogValue = min_log_data - 1.0001*realPrecision; // lies between the zero sentinel and the smallest real log value; presumably used on decompression to recognise zeros -- TODO confirm
+    free(log_data);
+
+    if(!positive){
+	    unsigned char * comp_signs;
+		// compress signs; the pwrErrBoundBytes fields of tdps are used to carry the compressed sign array
+		unsigned long signSize = sz_lossless_compress(confparams_cpr->losslessCompressor, confparams_cpr->gzipMode, signs, dataLength, &comp_signs);
+		tdps->pwrErrBoundBytes = comp_signs;
+		tdps->pwrErrBoundBytes_size = signSize;
+	}
+	else{
+		tdps->pwrErrBoundBytes = NULL;
+		tdps->pwrErrBoundBytes_size = 0;
+	}
+	free(signs);
+	// Serialize; if the result is larger than the raw doubles, store oriData instead. The count passed is dataLength = r1*r2 (it used to be dataLength+2, two more elements than the array is treated as holding).
+    convertTDPStoFlatBytes_double(tdps, newByteData, outSize);
+    if(*outSize>dataLength*sizeof(double))
+            SZ_compress_args_double_StoreOriData(oriData, dataLength, tdps, newByteData, outSize);
+
+    free_TightDataPointStorageD(tdps);
+}
+
+void SZ_compress_args_double_NoCkRngeNoGzip_3D_pwr_pre_log(unsigned char** newByteData, double *oriData, double pwrErrRatio, size_t r1, size_t r2, size_t r3, size_t *outSize, double min, double max){
+	// Compresses the r1*r2*r3 doubles in oriData for the relative ratio pwrErrRatio: signs are split off, log2(|x|) is handed to SZ_compress_double_3D_MDQ, and the serialized bytes/size are produced through newByteData/outSize.
+	size_t dataLength = r1 * r2 * r3;
+	double * log_data = (double *) malloc(dataLength * sizeof(double));
+	// signs[i] becomes 1 for every negative input and stays 0 otherwise
+	unsigned char * signs = (unsigned char *) malloc(dataLength);
+	memset(signs, 0, dataLength);
+	// preprocess: seed max_abs_log_data from the caller-supplied min/max (when min is 0 only max is used, when max is 0 only min is used)
+	double max_abs_log_data;
+    if(min == 0) max_abs_log_data = fabs(log2(fabs(max)));
+    else if(max == 0) max_abs_log_data = fabs(log2(fabs(min)));
+    else max_abs_log_data = fabs(log2(fabs(min))) > fabs(log2(fabs(max))) ? fabs(log2(fabs(min))) : fabs(log2(fabs(max)));
+    double min_log_data = max_abs_log_data;
+	bool positive = true; // cleared as soon as one negative value is seen
+	for(size_t i=0; i<dataLength; i++){
+		if(oriData[i] < 0){
+			signs[i] = 1;
+			log_data[i] = -oriData[i];
+			positive = false;
+		}
+		else
+			log_data[i] = oriData[i];
+		if(log_data[i] > 0){ // zeros are left untouched here and replaced by a sentinel further down
+			log_data[i] = log2(log_data[i]);
+			if(log_data[i] > max_abs_log_data) max_abs_log_data = log_data[i];
+			if(log_data[i] < min_log_data) min_log_data = log_data[i];
+		}
+	}
+	// realPrecision is the bound used on the log2 values: log2(1 + pwrErrRatio) reduced by max_abs_log_data * 2.23e-16 (presumably a floating-point rounding margin -- TODO confirm)
+	double valueRangeSize, medianValue_f;
+	computeRangeSize_double(log_data, dataLength, &valueRangeSize, &medianValue_f);	
+	if(fabs(min_log_data) > max_abs_log_data) max_abs_log_data = fabs(min_log_data);
+	double realPrecision = log2(1.0 + pwrErrRatio) - max_abs_log_data * 2.23e-16;
+	for(size_t i=0; i<dataLength; i++){
+		if(oriData[i] == 0){
+			log_data[i] = min_log_data - 2.0001*realPrecision; // zero has no log2: park it 2.0001*realPrecision below the smallest log value
+		}
+	}
+    TightDataPointStorageD* tdps = SZ_compress_double_3D_MDQ(log_data, r1, r2, r3, realPrecision, valueRangeSize, medianValue_f);
+    tdps->minLogValue = min_log_data - 1.0001*realPrecision; // lies between the zero sentinel and the smallest real log value; presumably used on decompression to recognise zeros -- TODO confirm
+    free(log_data);
+    if(!positive){
+	    unsigned char * comp_signs;
+		// compress signs; the pwrErrBoundBytes fields of tdps are used to carry the compressed sign array
+		unsigned long signSize = sz_lossless_compress(confparams_cpr->losslessCompressor, confparams_cpr->gzipMode, signs, dataLength, &comp_signs);
+		tdps->pwrErrBoundBytes = comp_signs;
+		tdps->pwrErrBoundBytes_size = signSize;
+	}
+	else{
+		tdps->pwrErrBoundBytes = NULL;
+		tdps->pwrErrBoundBytes_size = 0;
+	}
+	free(signs);
+	// Serialize; if the result is larger than the raw doubles, store oriData instead. The count passed is dataLength = r1*r2*r3 (it used to be dataLength+2, two more elements than the array is treated as holding).
+    convertTDPStoFlatBytes_double(tdps, newByteData, outSize);
+    if(*outSize>dataLength*sizeof(double))
+            SZ_compress_args_double_StoreOriData(oriData, dataLength, tdps, newByteData, outSize);
+
+    free_TightDataPointStorageD(tdps);
+}
diff --git a/thirdparty/SZ/sz/src/sz_double_ts.c b/thirdparty/SZ/sz/src/sz_double_ts.c
index b83562e05a525f87cabb2825f41e91820f5e2ba5..de9f7cda5df5f5ea59962579583ce12605a38911 100644
--- a/thirdparty/SZ/sz/src/sz_double_ts.c
+++ b/thirdparty/SZ/sz/src/sz_double_ts.c
@@ -66,7 +66,7 @@ unsigned int optimize_intervals_double_1D_ts(double *oriData, size_t dataLength,
 TightDataPointStorageD* SZ_compress_double_1D_MDQ_ts(double *oriData, size_t dataLength, sz_multisteps* multisteps,
 double realPrecision, double valueRangeSize, double medianValue_d)
 {
-double* preStepData = (double*)(multisteps->hist_data);
+	double* preStepData = (double*)(multisteps->hist_data);
 	//store the decompressed data
 	double* decData = (double*)malloc(sizeof(double)*dataLength);
 	memset(decData, 0, sizeof(double)*dataLength);
diff --git a/thirdparty/SZ/sz/src/sz_float.c b/thirdparty/SZ/sz/src/sz_float.c
index c0a2a18b36adaa0bd9cd84d943606e7fcde8e792..74d7f20e8e9acde6150c1608fdae0b5ef31cec4f 100644
--- a/thirdparty/SZ/sz/src/sz_float.c
+++ b/thirdparty/SZ/sz/src/sz_float.c
@@ -1,6 +1,6 @@
 /**
  *  @file sz_float.c
- *  @author Sheng Di and Dingwen Tao
+ *  @author Sheng Di, Dingwen Tao, Xin Liang
  *  @date Aug, 2016
  *  @brief SZ_Init, Compression and Decompression functions
  *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
@@ -25,6 +25,7 @@
 #include "zlib.h"
 #include "rw.h"
 #include "sz_float_ts.h"
+#include "utility.h"
 
 unsigned char* SZ_skip_compress_float(float* data, size_t dataLength, size_t* outSize)
 {
@@ -405,7 +406,7 @@ size_t dataLength, double realPrecision, float valueRangeSize, float medianValue
 		//pred = 2*last3CmprsData[0] - last3CmprsData[1];
 		pred = last3CmprsData[0];
 		predAbsErr = fabs(curData - pred);	
-		if(predAbsErr<=checkRadius)
+		if(predAbsErr<checkRadius)
 		{
 			state = (predAbsErr/realPrecision+1)/2;
 			if(curData>=pred)
@@ -1356,8 +1357,11 @@ char SZ_compress_args_float_NoCkRngeNoGzip_3D(unsigned char** newByteData, float
 			compressionType = 1; //time-series based compression 
 		}
 		else
-		{	
-			tdps = SZ_compress_float_3D_MDQ(oriData, r1, r2, r3, realPrecision, valueRangeSize, medianValue_f);
+		{
+			if(sz_with_regression == SZ_NO_REGRESSION)	
+				tdps = SZ_compress_float_3D_MDQ(oriData, r1, r2, r3, realPrecision, valueRangeSize, medianValue_f);
+			else
+				*newByteData = SZ_compress_float_3D_MDQ_nonblocked_with_blocked_regression(oriData, r1, r2, r3, realPrecision, outSize);
 			compressionType = 0; //snapshot-based compression
 			multisteps->lastSnapshotStep = timestep;
 		}		
@@ -1366,14 +1370,14 @@ char SZ_compress_args_float_NoCkRngeNoGzip_3D(unsigned char** newByteData, float
 #endif
 		tdps = SZ_compress_float_3D_MDQ(oriData, r1, r2, r3, realPrecision, valueRangeSize, medianValue_f);
 
+	if(tdps!=NULL)
+	{
+		convertTDPStoFlatBytes_float(tdps, newByteData, outSize);
+		if(*outSize>dataLength*sizeof(float))
+			SZ_compress_args_float_StoreOriData(oriData, dataLength, tdps, newByteData, outSize);
+		free_TightDataPointStorageF(tdps);
+	}
 
-	convertTDPStoFlatBytes_float(tdps, newByteData, outSize);
-
-	if(*outSize>dataLength*sizeof(float))
-		SZ_compress_args_float_StoreOriData(oriData, dataLength, tdps, newByteData, outSize);
-
-	free_TightDataPointStorageF(tdps);
-	
 	return compressionType;
 }
 
@@ -1770,8 +1774,8 @@ int errBoundMode, double absErr_Bound, double relBoundRatio, double pwrErrRatio)
 		{
 			if(errBoundMode>=PW_REL)
 			{	
-				//SZ_compress_args_float_NoCkRngeNoGzip_1D_pwr(newByteData, oriData, realPrecision, r1, outSize, min, max);
-				SZ_compress_args_float_NoCkRngeNoGzip_1D_pwrgroup(newByteData, oriData, r1, absErr_Bound, relBoundRatio, pwrErrRatio, valueRangeSize, medianValue, outSize);
+				SZ_compress_args_float_NoCkRngeNoGzip_1D_pwr_pre_log(newByteData, oriData, pwrErrRatio, r1, outSize, min, max);
+				//SZ_compress_args_float_NoCkRngeNoGzip_1D_pwrgroup(newByteData, oriData, r1, absErr_Bound, relBoundRatio, pwrErrRatio, valueRangeSize, medianValue, outSize);
 			}
 			else
 				SZ_compress_args_float_NoCkRngeNoGzip_1D(newByteData, oriData, r1, realPrecision, outSize, valueRangeSize, medianValue);
@@ -1779,21 +1783,21 @@ int errBoundMode, double absErr_Bound, double relBoundRatio, double pwrErrRatio)
 		else if(r5==0&&r4==0&&r3==0)
 		{
 			if(errBoundMode>=PW_REL)
-				SZ_compress_args_float_NoCkRngeNoGzip_2D_pwr(newByteData, oriData, realPrecision, r2, r1, outSize, min, max);
+				SZ_compress_args_float_NoCkRngeNoGzip_2D_pwr_pre_log(newByteData, oriData, pwrErrRatio, r2, r1, outSize, min, max);
 			else
 				SZ_compress_args_float_NoCkRngeNoGzip_2D(newByteData, oriData, r2, r1, realPrecision, outSize, valueRangeSize, medianValue);
 		}
 		else if(r5==0&&r4==0)
 		{
 			if(errBoundMode>=PW_REL)
-				SZ_compress_args_float_NoCkRngeNoGzip_3D_pwr(newByteData, oriData, realPrecision, r3, r2, r1, outSize, min, max);
+				SZ_compress_args_float_NoCkRngeNoGzip_3D_pwr_pre_log(newByteData, oriData, pwrErrRatio, r3, r2, r1, outSize, min, max);
 			else
 				SZ_compress_args_float_NoCkRngeNoGzip_3D(newByteData, oriData, r3, r2, r1, realPrecision, outSize, valueRangeSize, medianValue);
 		}
 		else if(r5==0)
 		{
 			if(errBoundMode>=PW_REL)
-				SZ_compress_args_float_NoCkRngeNoGzip_3D_pwr(newByteData, oriData, realPrecision, r4*r3, r2, r1, outSize, min, max);
+				SZ_compress_args_float_NoCkRngeNoGzip_3D_pwr_pre_log(newByteData, oriData, pwrErrRatio, r4*r3, r2, r1, outSize, min, max);
 			else
 				SZ_compress_args_float_NoCkRngeNoGzip_3D(newByteData, oriData, r4*r3, r2, r1, realPrecision, outSize, valueRangeSize, medianValue);
 		}
@@ -1854,9 +1858,8 @@ int errBoundMode, double absErr_Bound, double relBoundRatio, double pwRelBoundRa
 		{
 			if(confparams_cpr->errorBoundMode>=PW_REL)
 			{
-				//SZ_compress_args_float_NoCkRngeNoGzip_1D_pwr(&tmpByteData, oriData, realPrecision, r1, &tmpOutSize, min, max);
-				SZ_compress_args_float_NoCkRngeNoGzip_1D_pwrgroup(&tmpByteData, oriData, r1, absErr_Bound, relBoundRatio, pwRelBoundRatio, 
-				valueRangeSize, medianValue, &tmpOutSize);
+				SZ_compress_args_float_NoCkRngeNoGzip_1D_pwr_pre_log(&tmpByteData, oriData, pwRelBoundRatio, r1, &tmpOutSize, min, max);
+				//SZ_compress_args_float_NoCkRngeNoGzip_1D_pwrgroup(&tmpByteData, oriData, r1, absErr_Bound, relBoundRatio, pwRelBoundRatio, valueRangeSize, medianValue, &tmpOutSize);
 			}
 			else
 #ifdef HAVE_TIMECMPR
@@ -1870,33 +1873,43 @@ int errBoundMode, double absErr_Bound, double relBoundRatio, double pwRelBoundRa
 		if (r3==0)
 		{			
 			if(confparams_cpr->errorBoundMode>=PW_REL)
-				SZ_compress_args_float_NoCkRngeNoGzip_2D_pwr(&tmpByteData, oriData, realPrecision, r2, r1, &tmpOutSize, min, max);
+				SZ_compress_args_float_NoCkRngeNoGzip_2D_pwr_pre_log(&tmpByteData, oriData, pwRelBoundRatio, r2, r1, &tmpOutSize, min, max);
 			else
 #ifdef HAVE_TIMECMPR
 				if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)				
 					multisteps->compressionType = SZ_compress_args_float_NoCkRngeNoGzip_2D(&tmpByteData, oriData, r2, r1, realPrecision, &tmpOutSize, valueRangeSize, medianValue);
 				else
 #endif
-					SZ_compress_args_float_NoCkRngeNoGzip_2D(&tmpByteData, oriData, r2, r1, realPrecision, &tmpOutSize, valueRangeSize, medianValue);
+				{	
+					if(sz_with_regression == SZ_NO_REGRESSION)
+						SZ_compress_args_float_NoCkRngeNoGzip_2D(&tmpByteData, oriData, r2, r1, realPrecision, &tmpOutSize, valueRangeSize, medianValue);
+					else 
+						tmpByteData = SZ_compress_float_2D_MDQ_nonblocked_with_blocked_regression(oriData, r2, r1, realPrecision, &tmpOutSize);					
+				}
 		}
 		else
 		if (r4==0)
 		{
 			if(confparams_cpr->errorBoundMode>=PW_REL)
-				SZ_compress_args_float_NoCkRngeNoGzip_3D_pwr(&tmpByteData, oriData, realPrecision, r3, r2, r1, &tmpOutSize, min, max);
+				SZ_compress_args_float_NoCkRngeNoGzip_3D_pwr_pre_log(&tmpByteData, oriData, pwRelBoundRatio, r3, r2, r1, &tmpOutSize, min, max);
 			else
 #ifdef HAVE_TIMECMPR
 				if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)				
-					multisteps->compressionType = SZ_compress_args_float_NoCkRngeNoGzip_3D(&tmpByteData, oriData, r3, r2, r1, realPrecision, &tmpOutSize, valueRangeSize, medianValue);
+						multisteps->compressionType = SZ_compress_args_float_NoCkRngeNoGzip_3D(&tmpByteData, oriData, r3, r2, r1, realPrecision, &tmpOutSize, valueRangeSize, medianValue);
 				else
 #endif
-					SZ_compress_args_float_NoCkRngeNoGzip_3D(&tmpByteData, oriData, r3, r2, r1, realPrecision, &tmpOutSize, valueRangeSize, medianValue);
+				{
+					if(sz_with_regression == SZ_NO_REGRESSION)
+						SZ_compress_args_float_NoCkRngeNoGzip_3D(&tmpByteData, oriData, r3, r2, r1, realPrecision, &tmpOutSize, valueRangeSize, medianValue);
+					else 
+						tmpByteData = SZ_compress_float_3D_MDQ_nonblocked_with_blocked_regression(oriData, r3, r2, r1, realPrecision, &tmpOutSize);
+				}
 		}
 		else
 		if (r5==0)
 		{
 			if(confparams_cpr->errorBoundMode>=PW_REL)		
-				SZ_compress_args_float_NoCkRngeNoGzip_3D_pwr(&tmpByteData, oriData, realPrecision, r4*r3, r2, r1, &tmpOutSize, min, max);
+				SZ_compress_args_float_NoCkRngeNoGzip_3D_pwr_pre_log(&tmpByteData, oriData, pwRelBoundRatio, r4*r3, r2, r1, &tmpOutSize, min, max);
 				//ToDO
 				//SZ_compress_args_float_NoCkRngeNoGzip_4D_pwr(&tmpByteData, oriData, r4, r3, r2, r1, &tmpOutSize, min, max);
 			else
@@ -1905,7 +1918,12 @@ int errBoundMode, double absErr_Bound, double relBoundRatio, double pwRelBoundRa
 					multisteps->compressionType = SZ_compress_args_float_NoCkRngeNoGzip_4D(&tmpByteData, oriData, r4, r3, r2, r1, realPrecision, &tmpOutSize, valueRangeSize, medianValue);
 				else
 #endif
-					SZ_compress_args_float_NoCkRngeNoGzip_4D(&tmpByteData, oriData, r4, r3, r2, r1, realPrecision, &tmpOutSize, valueRangeSize, medianValue);
+				{
+					if(sz_with_regression == SZ_NO_REGRESSION)
+						SZ_compress_args_float_NoCkRngeNoGzip_4D(&tmpByteData, oriData, r4, r3, r2, r1, realPrecision, &tmpOutSize, valueRangeSize, medianValue);
+					else 
+						tmpByteData = SZ_compress_float_3D_MDQ_nonblocked_with_blocked_regression(oriData, r4*r3, r2, r1, realPrecision, &tmpOutSize);								
+				}
 		}
 		else
 		{
@@ -1920,7 +1938,7 @@ int errBoundMode, double absErr_Bound, double relBoundRatio, double pwRelBoundRa
 		}
 		else if(confparams_cpr->szMode==SZ_BEST_COMPRESSION || confparams_cpr->szMode==SZ_DEFAULT_COMPRESSION || confparams_cpr->szMode==SZ_TEMPORAL_COMPRESSION)
 		{
-			*outSize = zlib_compress5(tmpByteData, tmpOutSize, newByteData, confparams_cpr->gzipMode);
+			*outSize = sz_lossless_compress(confparams_cpr->losslessCompressor, confparams_cpr->gzipMode, tmpByteData, tmpOutSize, newByteData);
 			free(tmpByteData);
 		}
 		else
@@ -3374,7 +3392,7 @@ unsigned int optimize_intervals_float_3D_opt(float *oriData, size_t r1, size_t r
 	float pred_value = 0, pred_err;
 	size_t *intervals = (size_t*)malloc(confparams_cpr->maxRangeRadius*sizeof(size_t));
 	memset(intervals, 0, confparams_cpr->maxRangeRadius*sizeof(size_t));
-	size_t totalSampleSize = 0;//(r1-1)*(r2-1)*(r3-1)/confparams_cpr->sampleDistance;
+	size_t totalSampleSize = 0;
 
 	size_t offset_count = confparams_cpr->sampleDistance - 2; // count r3 offset
 	size_t offset_count_2;
@@ -3389,11 +3407,8 @@ unsigned int optimize_intervals_float_3D_opt(float *oriData, size_t r1, size_t r
 		if(radiusIndex>=confparams_cpr->maxRangeRadius)
 		{
 			radiusIndex = confparams_cpr->maxRangeRadius - 1;
-			//printf("radiusIndex=%d\n", radiusIndex);
 		}
 		intervals[radiusIndex]++;
-		// printf("TEST: %ld, i: %ld\tj: %ld\tk: %ld\n", data_pos - oriData);
-		// fflush(stdout);
 		offset_count += confparams_cpr->sampleDistance;
 		if(offset_count >= r3){
 			n2_count ++;
@@ -3409,9 +3424,6 @@ unsigned int optimize_intervals_float_3D_opt(float *oriData, size_t r1, size_t r
 		}
 		else data_pos += confparams_cpr->sampleDistance;
 	}	
-	// printf("sample_count: %ld\n", sample_count);
-	// fflush(stdout);
-	// if(*max_freq < 0.15) *max_freq *= 2;
 	//compute the appropriate number
 	size_t targetCount = totalSampleSize*confparams_cpr->predThreshold;
 	size_t sum = 0;
@@ -3429,7 +3441,6 @@ unsigned int optimize_intervals_float_3D_opt(float *oriData, size_t r1, size_t r
 	if(powerOf2<32)
 		powerOf2 = 32;
 	free(intervals);
-	//printf("targetCount=%d, sum=%d, totalSampleSize=%d, ratio=%f, accIntervals=%d, powerOf2=%d\n", targetCount, sum, totalSampleSize, (double)sum/(double)totalSampleSize, accIntervals, powerOf2);
 	return powerOf2;
 }
 
@@ -3749,10 +3760,7 @@ unsigned int optimize_intervals_float_2D_opt(float *oriData, size_t r1, size_t r
 	float pred_value = 0, pred_err;
 	size_t *intervals = (size_t*)malloc(confparams_cpr->maxRangeRadius*sizeof(size_t));
 	memset(intervals, 0, confparams_cpr->maxRangeRadius*sizeof(size_t));
-	size_t totalSampleSize = 0;//(r1-1)*(r2-1)/confparams_cpr->sampleDistance;
-
-	//float max = oriData[0];
-	//float min = oriData[0];
+	size_t totalSampleSize = 0;
 
 	size_t offset_count = confparams_cpr->sampleDistance - 1; // count r2 offset
 	size_t offset_count_2;
@@ -3811,7 +3819,6 @@ unsigned int optimize_intervals_float_1D_opt(float *oriData, size_t dataLength,
 	float * data_pos = oriData + 2;
 	while(data_pos - oriData < dataLength){
 		totalSampleSize++;
-		//pred_value = 2*data_pos[-1] - data_pos[-2];
 		pred_value = data_pos[-1];
 		pred_err = fabs(pred_value - *data_pos);
 		radiusIndex = (unsigned long)((pred_err/realPrecision+1)/2);
@@ -3840,7 +3847,6 @@ unsigned int optimize_intervals_float_1D_opt(float *oriData, size_t dataLength,
 		powerOf2 = 32;
 	
 	free(intervals);
-	//printf("accIntervals=%d, powerOf2=%d\n", accIntervals, powerOf2);
 	return powerOf2;
 }
 
@@ -4036,3 +4042,2790 @@ size_t SZ_compress_float_2D_MDQ_RA_block(float * block_ori_data, float * mean, s
 	return unpredictable_count;
 }
 
+/*The above code is for sz 1.4.13; the following code is for sz 2.0*/
+
+unsigned int optimize_intervals_float_2D_with_freq_and_dense_pos(float *oriData, size_t r1, size_t r2, double realPrecision, float * dense_pos, float * max_freq, float * mean_freq)
+{	// Samples the r1*r2 array oriData; returns a power-of-two interval count (>= 32) and writes *dense_pos, *max_freq and *mean_freq.
+	float mean = 0.0;
+	size_t len = r1 * r2;
+	size_t mean_distance = (int) (sqrt(len)); // stride of the coarse mean estimate below (truncated sqrt of the element count)
+	// Coarse mean: average of every mean_distance-th element of oriData.
+	float * data_pos = oriData;
+	size_t mean_count = 0;
+	while(data_pos - oriData < len){
+		mean += *data_pos;
+		mean_count ++;
+		data_pos += mean_distance;
+	}
+	if(mean_count > 0) mean /= mean_count;
+	size_t range = 8192; // number of bins in the value histogram freq_intervals
+	size_t radius = 4096; // bin offset added to (value - mean)/realPrecision, so values near the mean land around this index
+	size_t * freq_intervals = (size_t *) malloc(range*sizeof(size_t));
+	memset(freq_intervals, 0, range*sizeof(size_t));
+
+	unsigned int maxRangeRadius = confparams_cpr->maxRangeRadius;
+	int sampleDistance = confparams_cpr->sampleDistance;
+	float predThreshold = confparams_cpr->predThreshold;
+
+	size_t i;
+	size_t radiusIndex;
+	float pred_value = 0, pred_err;
+	size_t *intervals = (size_t*)malloc(maxRangeRadius*sizeof(size_t)); // histogram of prediction errors, indexed by radiusIndex
+	memset(intervals, 0, maxRangeRadius*sizeof(size_t));
+
+	float mean_diff;
+	ptrdiff_t freq_index;
+	size_t freq_count = 0; // samples whose prediction error is below realPrecision
+	size_t n1_count = 1; // row index of the row currently being sampled
+	size_t offset_count = sampleDistance - 1; // column of the current sample within its row
+	size_t offset_count_2 = 0;
+	size_t sample_count = 0;
+	data_pos = oriData + r2 + offset_count; // sampling starts in row 1 because the prediction reads the row above
+	while(data_pos - oriData < len){
+		pred_value = data_pos[-1] + data_pos[-r2] - data_pos[-r2-1]; // left + upper - upper-left neighbour
+		pred_err = fabs(pred_value - *data_pos);
+		if(pred_err < realPrecision) freq_count ++;
+		radiusIndex = (unsigned long)((pred_err/realPrecision+1)/2); // pred_err/(2*realPrecision) + 0.5, truncated
+		if(radiusIndex>=maxRangeRadius)
+			radiusIndex = maxRangeRadius - 1; // clamp into the last error bin
+		intervals[radiusIndex]++;
+		// Bin the sample by its distance from the coarse mean, in bins realPrecision wide.
+		mean_diff = *data_pos - mean;
+		if(mean_diff > 0) freq_index = (ptrdiff_t)(mean_diff/realPrecision) + radius;
+		else freq_index = (ptrdiff_t)(mean_diff/realPrecision) - 1 + radius; // the cast truncates toward zero, so non-positive differences are shifted one bin down
+		if(freq_index <= 0){
+			freq_intervals[0] ++; // below the histogram: count in the first bin
+		}
+		else if(freq_index >= range){
+			freq_intervals[range - 1] ++; // above the histogram: count in the last bin
+		}
+		else{
+			freq_intervals[freq_index] ++;
+		}
+		offset_count += sampleDistance;
+		if(offset_count >= r2){
+			n1_count ++;
+			offset_count_2 = n1_count % sampleDistance;
+			data_pos += (r2 + sampleDistance - offset_count) + (sampleDistance - offset_count_2); // jump to column (sampleDistance - offset_count_2) of the next row
+			offset_count = (sampleDistance - offset_count_2);
+			if(offset_count == 0) offset_count ++;
+		}
+		else data_pos += sampleDistance; // next sample in the same row
+		sample_count ++;
+	}
+	*max_freq = freq_count * 1.0/ sample_count; // fraction of samples predicted to within realPrecision
+
+	//compute the appropriate number: smallest error radius whose cumulative sample count exceeds sample_count*predThreshold
+	size_t targetCount = sample_count*predThreshold;
+	size_t sum = 0;
+	for(i=0;i<maxRangeRadius;i++)
+	{
+		sum += intervals[i];
+		if(sum>targetCount)
+			break;
+	}
+	if(i>=maxRangeRadius)
+		i = maxRangeRadius-1;
+	unsigned int accIntervals = 2*(i+1);
+	unsigned int powerOf2 = roundUpToPowerOf2(accIntervals);
+
+	if(powerOf2<32)
+		powerOf2 = 32; // the returned interval count is never below 32
+
+	// collect frequency: find the pair of adjacent histogram bins (i, i+1) that together hold the most samples
+	size_t max_sum = 0;
+	size_t max_index = 0;
+	size_t tmp_sum;
+	size_t * freq_pos = freq_intervals + 1;
+	for(size_t i=1; i<range-2; i++){ // this loop-scoped i shadows the function-level i declared above
+		tmp_sum = freq_pos[0] + freq_pos[1];
+		if(tmp_sum > max_sum){
+			max_sum = tmp_sum;
+			max_index = i;
+		}
+		freq_pos ++;
+	}
+	*dense_pos = mean + realPrecision * (ptrdiff_t)(max_index + 1 - radius); // value at the edge shared by bins max_index and max_index+1
+	*mean_freq = max_sum * 1.0 / sample_count; // fraction of samples that fall in those two bins
+
+	free(freq_intervals);
+	free(intervals);
+	return powerOf2;
+}
+
+// 2D:  modified for higher performance
+#define MIN(a, b) ((a) < (b) ? (a) : (b)) /* smaller of a and b; fully parenthesized so it composes inside larger expressions (each argument may still be evaluated twice) */
+unsigned char * SZ_compress_float_2D_MDQ_nonblocked_with_blocked_regression(float *oriData, size_t r1, size_t r2, double realPrecision, size_t * comp_size){
+
+	unsigned int quantization_intervals;
+	float sz_sample_correct_freq = -1;//0.5; //-1
+	float dense_pos;
+	float mean_flush_freq;
+	unsigned char use_mean = 0;
+
+	if(exe_params->optQuantMode==1)
+	{
+		quantization_intervals = optimize_intervals_float_2D_with_freq_and_dense_pos(oriData, r1, r2, realPrecision, &dense_pos, &sz_sample_correct_freq, &mean_flush_freq);
+		if(mean_flush_freq > 0.5 || mean_flush_freq > sz_sample_correct_freq) use_mean = 1;
+		updateQuantizationInfo(quantization_intervals);
+	}	
+	else{
+		quantization_intervals = exe_params->intvCapacity;
+	}
+
+	// calculate block dims
+	size_t num_x, num_y;
+	size_t block_size = 16;
+
+	SZ_COMPUTE_2D_NUMBER_OF_BLOCKS(r1, num_x, block_size);
+	SZ_COMPUTE_2D_NUMBER_OF_BLOCKS(r2, num_y, block_size);
+
+	size_t split_index_x, split_index_y;
+	size_t early_blockcount_x, early_blockcount_y;
+	size_t late_blockcount_x, late_blockcount_y;
+	SZ_COMPUTE_BLOCKCOUNT(r1, num_x, split_index_x, early_blockcount_x, late_blockcount_x);
+	SZ_COMPUTE_BLOCKCOUNT(r2, num_y, split_index_y, early_blockcount_y, late_blockcount_y);
+
+	size_t max_num_block_elements = early_blockcount_x * early_blockcount_y;
+	size_t num_blocks = num_x * num_y;
+	size_t num_elements = r1 * r2;
+
+	size_t dim0_offset = r2;	
+
+	int * result_type = (int *) malloc(num_elements * sizeof(int));
+	size_t unpred_data_max_size = max_num_block_elements;
+	float * result_unpredictable_data = (float *) malloc(unpred_data_max_size * sizeof(float) * num_blocks);
+	size_t total_unpred = 0;
+	size_t unpredictable_count;
+	float * data_pos = oriData;
+	int * type = result_type;
+	size_t offset_x, offset_y;
+	size_t current_blockcount_x, current_blockcount_y;
+
+	float * reg_params = (float *) malloc(num_blocks * 4 * sizeof(float));
+	float * reg_params_pos = reg_params;
+	// move regression part out
+	size_t params_offset_b = num_blocks;
+	size_t params_offset_c = 2*num_blocks;
+	for(size_t i=0; i<num_x; i++){
+		for(size_t j=0; j<num_y; j++){
+			current_blockcount_x = (i < split_index_x) ? early_blockcount_x : late_blockcount_x;
+			current_blockcount_y = (j < split_index_y) ? early_blockcount_y : late_blockcount_y;
+			offset_x = (i < split_index_x) ? i * early_blockcount_x : i * late_blockcount_x + split_index_x;
+			offset_y = (j < split_index_y) ? j * early_blockcount_y : j * late_blockcount_y + split_index_y;
+
+			data_pos = oriData + offset_x * dim0_offset + offset_y;
+
+			{
+				float * cur_data_pos = data_pos;
+				float fx = 0.0;
+				float fy = 0.0;
+				float f = 0;
+				double sum_x; 
+				float curData;
+				for(size_t i=0; i<current_blockcount_x; i++){
+					sum_x = 0;
+					for(size_t j=0; j<current_blockcount_y; j++){
+						curData = *cur_data_pos;
+						sum_x += curData;
+						fy += curData * j;
+						cur_data_pos ++;
+					}
+					fx += sum_x * i;
+					f += sum_x;
+					cur_data_pos += dim0_offset - current_blockcount_y;
+				}
+				float coeff = 1.0 / (current_blockcount_x * current_blockcount_y);
+				reg_params_pos[0] = (2 * fx / (current_blockcount_x - 1) - f) * 6 * coeff / (current_blockcount_x + 1);
+				reg_params_pos[params_offset_b] = (2 * fy / (current_blockcount_y - 1) - f) * 6 * coeff / (current_blockcount_y + 1);
+				reg_params_pos[params_offset_c] = f * coeff - ((current_blockcount_x - 1) * reg_params_pos[0] / 2 + (current_blockcount_y - 1) * reg_params_pos[params_offset_b] / 2);
+			}
+
+			reg_params_pos ++;
+		}
+	}
+
+	//Compress coefficient arrays
+	double precision_a, precision_b, precision_c;
+	float rel_param_err = 0.15/3;
+	precision_a = rel_param_err * realPrecision / late_blockcount_x;
+	precision_b = rel_param_err * realPrecision / late_blockcount_y;
+	precision_c = rel_param_err * realPrecision;
+
+	float mean = 0;
+	use_mean = 0;
+	if(use_mean){
+		// compute mean
+		double sum = 0.0;
+		size_t mean_count = 0;
+		for(size_t i=0; i<num_elements; i++){
+			if(fabs(oriData[i] - dense_pos) < realPrecision){
+				sum += oriData[i];
+				mean_count ++;
+			}
+		}
+		if(mean_count > 0) mean = sum / mean_count;
+	}
+
+
+	double tmp_realPrecision = realPrecision;
+
+	// use two prediction buffers for higher performance
+	float * unpredictable_data = result_unpredictable_data;
+	unsigned char * indicator = (unsigned char *) malloc(num_blocks * sizeof(unsigned char));
+	memset(indicator, 0, num_blocks * sizeof(unsigned char));
+	size_t reg_count = 0;
+	size_t strip_dim_0 = early_blockcount_x + 1;
+	size_t strip_dim_1 = r2 + 1;
+	size_t strip_dim0_offset = strip_dim_1;
+	unsigned char * indicator_pos = indicator;
+	size_t prediction_buffer_size = strip_dim_0 * strip_dim0_offset * sizeof(float);
+	float * prediction_buffer_1 = (float *) malloc(prediction_buffer_size);
+	memset(prediction_buffer_1, 0, prediction_buffer_size);
+	float * prediction_buffer_2 = (float *) malloc(prediction_buffer_size);
+	memset(prediction_buffer_2, 0, prediction_buffer_size);
+	float * cur_pb_buf = prediction_buffer_1;
+	float * next_pb_buf = prediction_buffer_2;
+	float * cur_pb_buf_pos;
+	float * next_pb_buf_pos;
+	int intvCapacity = exe_params->intvCapacity;
+	int intvRadius = exe_params->intvRadius;
+	int use_reg = 0;
+
+	reg_params_pos = reg_params;
+	// compress the regression coefficients on the fly
+	float last_coeffcients[3] = {0.0};
+	int coeff_intvCapacity_sz = 65536;
+	int coeff_intvRadius = coeff_intvCapacity_sz / 2;
+	int * coeff_type[3];
+	int * coeff_result_type = (int *) malloc(num_blocks*3*sizeof(int));
+	float * coeff_unpred_data[3];
+	float * coeff_unpredictable_data = (float *) malloc(num_blocks*3*sizeof(float));
+	double precision[3];
+	precision[0] = precision_a, precision[1] = precision_b, precision[2] = precision_c;
+	for(int i=0; i<3; i++){
+		coeff_type[i] = coeff_result_type + i * num_blocks;
+		coeff_unpred_data[i] = coeff_unpredictable_data + i * num_blocks;
+	}
+	int coeff_index = 0;
+	unsigned int coeff_unpredictable_count[3] = {0};
+	if(use_mean){
+		type = result_type;
+		int intvCapacity_sz = intvCapacity - 2;
+		for(size_t i=0; i<num_x; i++){
+			current_blockcount_x = (i < split_index_x) ? early_blockcount_x : late_blockcount_x;
+			offset_x = (i < split_index_x) ? i * early_blockcount_x : i * late_blockcount_x + split_index_x;
+			data_pos = oriData + offset_x * dim0_offset;
+
+			cur_pb_buf_pos = cur_pb_buf + strip_dim0_offset + 1;
+			next_pb_buf_pos = next_pb_buf + 1;
+			float * pb_pos = cur_pb_buf_pos;
+			float * next_pb_pos = next_pb_buf_pos;
+
+			for(size_t j=0; j<num_y; j++){
+				offset_y = (j < split_index_y) ? j * early_blockcount_y : j * late_blockcount_y + split_index_y;
+				current_blockcount_y = (j < split_index_y) ? early_blockcount_y : late_blockcount_y;
+				
+				/*sampling: decide which predictor to use (regression or lorenzo)*/
+				{
+					float * cur_data_pos;
+					float curData;
+					float pred_reg, pred_sz;
+					float err_sz = 0.0, err_reg = 0.0;
+					// [1, 1] [3, 3] [5, 5] [7, 7] [9, 9]
+					// [1, 9] [3, 7]		[7, 3] [9, 1]
+					int count = 0;
+					for(int i=1; i<current_blockcount_x; i+=2){
+						cur_data_pos = data_pos + i * dim0_offset + i;
+						curData = *cur_data_pos;
+						pred_sz = cur_data_pos[-1] + cur_data_pos[-dim0_offset] - cur_data_pos[-dim0_offset - 1];
+						pred_reg = reg_params_pos[0] * i + reg_params_pos[params_offset_b] * i + reg_params_pos[params_offset_c];
+						
+						err_sz += MIN(fabs(pred_sz - curData) + realPrecision*0.81, fabs(mean - curData));
+
+						err_reg += fabs(pred_reg - curData);
+
+						cur_data_pos = data_pos + i * dim0_offset + (block_size - i);
+						curData = *cur_data_pos;
+						pred_sz = cur_data_pos[-1] + cur_data_pos[-dim0_offset] - cur_data_pos[-dim0_offset - 1];
+						pred_reg = reg_params_pos[0] * i + reg_params_pos[params_offset_b] * (block_size - i) + reg_params_pos[params_offset_c];
+						err_sz += MIN(fabs(pred_sz - curData) + realPrecision*0.81, fabs(mean - curData));
+						
+						err_reg += fabs(pred_reg - curData);
+
+						count += 2;
+					}
+
+					use_reg = (err_reg < err_sz);
+				}
+				if(use_reg)
+				{
+					{
+						/*predict coefficients in current block via previous reg_block*/
+						float cur_coeff;
+						double diff, itvNum;
+						for(int e=0; e<3; e++){
+							cur_coeff = reg_params_pos[e*num_blocks];
+							diff = cur_coeff - last_coeffcients[e];
+							itvNum = fabs(diff)/precision[e] + 1;
+							if (itvNum < coeff_intvCapacity_sz){
+								if (diff < 0) itvNum = -itvNum;
+								coeff_type[e][coeff_index] = (int) (itvNum/2) + coeff_intvRadius;
+								last_coeffcients[e] = last_coeffcients[e] + 2 * (coeff_type[e][coeff_index] - coeff_intvRadius) * precision[e];
+								//guarantee compression error against the case of machine-epsilon
+								if(fabs(cur_coeff - last_coeffcients[e])>precision[e]){	
+									coeff_type[e][coeff_index] = 0;
+									last_coeffcients[e] = cur_coeff;	
+									coeff_unpred_data[e][coeff_unpredictable_count[e] ++] = cur_coeff;
+								}					
+							}
+							else{
+								coeff_type[e][coeff_index] = 0;
+								last_coeffcients[e] = cur_coeff;
+								coeff_unpred_data[e][coeff_unpredictable_count[e] ++] = cur_coeff;
+							}
+						}
+						coeff_index ++;
+					}
+					float curData;
+					float pred;
+					double itvNum;
+					double diff;
+					size_t index = 0;
+					size_t block_unpredictable_count = 0;
+					float * cur_data_pos = data_pos;
+					for(size_t ii=0; ii<current_blockcount_x - 1; ii++){
+						for(size_t jj=0; jj<current_blockcount_y - 1; jj++){
+							curData = *cur_data_pos;
+							pred = last_coeffcients[0] * ii + last_coeffcients[1] * jj + last_coeffcients[2];
+							diff = curData - pred;
+							itvNum = fabs(diff)/realPrecision + 1;
+							if (itvNum < intvCapacity){
+								if (diff < 0) itvNum = -itvNum;
+								type[index] = (int) (itvNum/2) + intvRadius;
+								pred = pred + 2 * (type[index] - intvRadius) * realPrecision;
+								//guarantee compression error against the case of machine-epsilon
+								if(fabs(curData - pred)>realPrecision){	
+									type[index] = 0;
+									pred = curData;
+									unpredictable_data[block_unpredictable_count ++] = curData;
+								}		
+							}
+							else{
+								type[index] = 0;
+								pred = curData;
+								unpredictable_data[block_unpredictable_count ++] = curData;
+							}
+							index ++;	
+							cur_data_pos ++;
+						}
+						/*dealing with the last jj (boundary)*/
+						{
+							size_t jj = current_blockcount_y - 1;
+							curData = *cur_data_pos;
+							pred = last_coeffcients[0] * ii + last_coeffcients[1] * jj + last_coeffcients[2];
+							diff = curData - pred;
+							itvNum = fabs(diff)/realPrecision + 1;
+							if (itvNum < intvCapacity){
+								if (diff < 0) itvNum = -itvNum;
+								type[index] = (int) (itvNum/2) + intvRadius;
+								pred = pred + 2 * (type[index] - intvRadius) * realPrecision;
+								//guarantee compression error against the case of machine-epsilon
+								if(fabs(curData - pred)>realPrecision){	
+									type[index] = 0;
+									pred = curData;
+									unpredictable_data[block_unpredictable_count ++] = curData;
+								}		
+							}
+							else{
+								type[index] = 0;
+								pred = curData;
+								unpredictable_data[block_unpredictable_count ++] = curData;
+							}
+
+							// assign value to block surfaces
+							pb_pos[ii * strip_dim0_offset + jj] = pred;
+							index ++;	
+							cur_data_pos ++;
+						}
+						cur_data_pos += dim0_offset - current_blockcount_y;
+					}
+					/*dealing with the last ii (boundary)*/
+					{
+						size_t ii = current_blockcount_x - 1;
+						for(size_t jj=0; jj<current_blockcount_y - 1; jj++){
+							curData = *cur_data_pos;
+							pred = last_coeffcients[0] * ii + last_coeffcients[1] * jj + last_coeffcients[2];
+							diff = curData - pred;
+							itvNum = fabs(diff)/realPrecision + 1;
+							if (itvNum < intvCapacity){
+								if (diff < 0) itvNum = -itvNum;
+								type[index] = (int) (itvNum/2) + intvRadius;
+								pred = pred + 2 * (type[index] - intvRadius) * realPrecision;
+								//guarantee compression error against the case of machine-epsilon
+								if(fabs(curData - pred)>realPrecision){	
+									type[index] = 0;
+									pred = curData;
+									unpredictable_data[block_unpredictable_count ++] = curData;
+								}		
+							}
+							else{
+								type[index] = 0;
+								pred = curData;
+								unpredictable_data[block_unpredictable_count ++] = curData;
+							}
+							// assign value to next prediction buffer
+							next_pb_pos[jj] = pred;
+							index ++;	
+							cur_data_pos ++;
+						}
+						/*dealing with the last jj (boundary)*/
+						{
+							size_t jj = current_blockcount_y - 1;
+							curData = *cur_data_pos;
+							pred = last_coeffcients[0] * ii + last_coeffcients[1] * jj + last_coeffcients[2];
+							diff = curData - pred;
+							itvNum = fabs(diff)/realPrecision + 1;
+							if (itvNum < intvCapacity){
+								if (diff < 0) itvNum = -itvNum;
+								type[index] = (int) (itvNum/2) + intvRadius;
+								pred = pred + 2 * (type[index] - intvRadius) * realPrecision;
+								//guarantee compression error against the case of machine-epsilon
+								if(fabs(curData - pred)>realPrecision){	
+									type[index] = 0;
+									pred = curData;
+									unpredictable_data[block_unpredictable_count ++] = curData;
+								}		
+							}
+							else{
+								type[index] = 0;
+								pred = curData;
+								unpredictable_data[block_unpredictable_count ++] = curData;
+							}
+
+							// assign value to block surfaces
+							pb_pos[ii * strip_dim0_offset + jj] = pred;
+							// assign value to next prediction buffer
+							next_pb_pos[jj] = pred;
+
+							index ++;	
+							cur_data_pos ++;
+						}
+					} // end ii == -1
+					unpredictable_count = block_unpredictable_count;
+					total_unpred += unpredictable_count;
+					unpredictable_data += unpredictable_count;					
+					reg_count ++;
+				}// end use_reg
+				else{
+					// use SZ
+					// SZ prediction
+					unpredictable_count = 0;
+					float * cur_pb_pos = pb_pos;
+					float * cur_data_pos = data_pos;
+					float curData;
+					float pred2D;
+					double itvNum, diff;
+					size_t index = 0;
+					for(size_t ii=0; ii<current_blockcount_x - 1; ii++){
+						for(size_t jj=0; jj<current_blockcount_y; jj++){
+							curData = *cur_data_pos;
+							if(fabs(curData - mean) <= realPrecision){
+								// adjust type[index] to intvRadius for coherence with freq in reg
+								type[index] = intvRadius;
+								*cur_pb_pos = mean;
+							}
+							else
+							{
+								pred2D = cur_pb_pos[-1] + cur_pb_pos[-strip_dim0_offset] - cur_pb_pos[-strip_dim0_offset - 1];
+								diff = curData - pred2D;
+								itvNum = fabs(diff)/realPrecision + 1;
+								if (itvNum < intvCapacity_sz){
+									if (diff < 0) itvNum = -itvNum;
+									type[index] = (int) (itvNum/2) + intvRadius;
+									*cur_pb_pos = pred2D + 2 * (type[index] - intvRadius) * tmp_realPrecision;
+									if(type[index] <= intvRadius) type[index] -= 1;
+									//guarantee compression error against the case of machine-epsilon
+									if(fabs(curData - *cur_pb_pos)>tmp_realPrecision){	
+										type[index] = 0;
+										*cur_pb_pos = curData;	
+										unpredictable_data[unpredictable_count ++] = curData;
+									}					
+								}
+								else{
+									type[index] = 0;
+									*cur_pb_pos = curData;
+									unpredictable_data[unpredictable_count ++] = curData;
+								}
+							}
+							index ++;
+							cur_pb_pos ++;
+							cur_data_pos ++;
+						}
+						cur_pb_pos += strip_dim0_offset - current_blockcount_y;
+						cur_data_pos += dim0_offset - current_blockcount_y;
+					}
+					/*dealing with the last ii (boundary)*/
+					{
+						// ii == current_blockcount_x - 1
+						for(size_t jj=0; jj<current_blockcount_y; jj++){
+							curData = *cur_data_pos;
+							if(fabs(curData - mean) <= realPrecision){
+								// adjust type[index] to intvRadius for coherence with freq in reg
+								type[index] = intvRadius;
+								*cur_pb_pos = mean;
+							}
+							else
+							{
+								pred2D = cur_pb_pos[-1] + cur_pb_pos[-strip_dim0_offset] - cur_pb_pos[-strip_dim0_offset - 1];
+								diff = curData - pred2D;
+								itvNum = fabs(diff)/realPrecision + 1;
+								if (itvNum < intvCapacity_sz){
+									if (diff < 0) itvNum = -itvNum;
+									type[index] = (int) (itvNum/2) + intvRadius;
+									*cur_pb_pos = pred2D + 2 * (type[index] - intvRadius) * tmp_realPrecision;
+									if(type[index] <= intvRadius) type[index] -= 1;
+									//guarantee compression error against the case of machine-epsilon
+									if(fabs(curData - *cur_pb_pos)>tmp_realPrecision){	
+										type[index] = 0;
+										*cur_pb_pos = curData;	
+										unpredictable_data[unpredictable_count ++] = curData;
+									}					
+								}
+								else{
+									type[index] = 0;
+									*cur_pb_pos = curData;
+									unpredictable_data[unpredictable_count ++] = curData;
+								}
+							}
+							next_pb_pos[jj] = *cur_pb_pos;
+							index ++;
+							cur_pb_pos ++;
+							cur_data_pos ++;
+						}
+					}
+					total_unpred += unpredictable_count;
+					unpredictable_data += unpredictable_count;
+					// change indicator
+					indicator_pos[j] = 1;
+				}// end SZ
+				reg_params_pos ++;
+				data_pos += current_blockcount_y;
+				pb_pos += current_blockcount_y;
+				next_pb_pos += current_blockcount_y;
+				type += current_blockcount_x * current_blockcount_y;
+			}// end j
+			indicator_pos += num_y;
+			float * tmp;
+			tmp = cur_pb_buf;
+			cur_pb_buf = next_pb_buf;
+			next_pb_buf = tmp;
+		}// end i
+	}// end use mean
+	else{
+		type = result_type;
+		int intvCapacity_sz = intvCapacity - 2;
+		for(size_t i=0; i<num_x; i++){
+			current_blockcount_x = (i < split_index_x) ? early_blockcount_x : late_blockcount_x;
+			offset_x = (i < split_index_x) ? i * early_blockcount_x : i * late_blockcount_x + split_index_x;
+			data_pos = oriData + offset_x * dim0_offset;
+
+			cur_pb_buf_pos = cur_pb_buf + strip_dim0_offset + 1;
+			next_pb_buf_pos = next_pb_buf + 1;
+			float * pb_pos = cur_pb_buf_pos;
+			float * next_pb_pos = next_pb_buf_pos;
+
+			for(size_t j=0; j<num_y; j++){
+				offset_y = (j < split_index_y) ? j * early_blockcount_y : j * late_blockcount_y + split_index_y;
+				current_blockcount_y = (j < split_index_y) ? early_blockcount_y : late_blockcount_y;
+				/*sampling*/
+				{
+					// sample [2i + 1, 2i + 1] [2i + 1, bs - 2i]
+					float * cur_data_pos;
+					float curData;
+					float pred_reg, pred_sz;
+					float err_sz = 0.0, err_reg = 0.0;
+					// [1, 1] [3, 3] [5, 5] [7, 7] [9, 9]
+					// [1, 9] [3, 7]		[7, 3] [9, 1]
+					int count = 0;
+					for(int i=1; i<current_blockcount_x; i+=2){
+						cur_data_pos = data_pos + i * dim0_offset + i;
+						curData = *cur_data_pos;
+						pred_sz = cur_data_pos[-1] + cur_data_pos[-dim0_offset] - cur_data_pos[-dim0_offset - 1];
+						pred_reg = reg_params_pos[0] * i + reg_params_pos[params_offset_b] * i + reg_params_pos[params_offset_c];
+						err_sz += fabs(pred_sz - curData);
+						err_reg += fabs(pred_reg - curData);
+
+						cur_data_pos = data_pos + i * dim0_offset + (block_size - i);
+						curData = *cur_data_pos;
+						pred_sz = cur_data_pos[-1] + cur_data_pos[-dim0_offset] - cur_data_pos[-dim0_offset - 1];
+						pred_reg = reg_params_pos[0] * i + reg_params_pos[params_offset_b] * (block_size - i) + reg_params_pos[params_offset_c];
+						err_sz += fabs(pred_sz - curData);
+						err_reg += fabs(pred_reg - curData);
+
+						count += 2;
+					}
+					err_sz += realPrecision * count * 0.81;
+					use_reg = (err_reg < err_sz);
+
+				}
+				if(use_reg)
+				{
+					{
+						/*predict coefficients in current block via previous reg_block*/
+						float cur_coeff;
+						double diff, itvNum;
+						for(int e=0; e<3; e++){
+							cur_coeff = reg_params_pos[e*num_blocks];
+							diff = cur_coeff - last_coeffcients[e];
+							itvNum = fabs(diff)/precision[e] + 1;
+							if (itvNum < coeff_intvCapacity_sz){
+								if (diff < 0) itvNum = -itvNum;
+								coeff_type[e][coeff_index] = (int) (itvNum/2) + coeff_intvRadius;
+								last_coeffcients[e] = last_coeffcients[e] + 2 * (coeff_type[e][coeff_index] - coeff_intvRadius) * precision[e];
+								//ganrantee comporession error against the case of machine-epsilon
+								if(fabs(cur_coeff - last_coeffcients[e])>precision[e]){	
+									coeff_type[e][coeff_index] = 0;
+									last_coeffcients[e] = cur_coeff;	
+									coeff_unpred_data[e][coeff_unpredictable_count[e] ++] = cur_coeff;
+								}					
+							}
+							else{
+								coeff_type[e][coeff_index] = 0;
+								last_coeffcients[e] = cur_coeff;
+								coeff_unpred_data[e][coeff_unpredictable_count[e] ++] = cur_coeff;
+							}
+						}
+						coeff_index ++;
+					}
+					float curData;
+					float pred;
+					double itvNum;
+					double diff;
+					size_t index = 0;
+					size_t block_unpredictable_count = 0;
+					float * cur_data_pos = data_pos;
+					for(size_t ii=0; ii<current_blockcount_x - 1; ii++){
+						for(size_t jj=0; jj<current_blockcount_y - 1; jj++){
+							curData = *cur_data_pos;
+							pred = last_coeffcients[0] * ii + last_coeffcients[1] * jj + last_coeffcients[2];
+							diff = curData - pred;
+							itvNum = fabs(diff)/realPrecision + 1;
+							if (itvNum < intvCapacity){
+								if (diff < 0) itvNum = -itvNum;
+								type[index] = (int) (itvNum/2) + intvRadius;
+								pred = pred + 2 * (type[index] - intvRadius) * realPrecision;
+								//ganrantee comporession error against the case of machine-epsilon
+								if(fabs(curData - pred)>realPrecision){	
+									type[index] = 0;
+									pred = curData;
+									unpredictable_data[block_unpredictable_count ++] = curData;
+								}		
+							}
+							else{
+								type[index] = 0;
+								pred = curData;
+								unpredictable_data[block_unpredictable_count ++] = curData;
+							}
+							index ++;	
+							cur_data_pos ++;
+						}
+						/*dealing with the last jj (boundary)*/
+						{
+							// jj == current_blockcount_y - 1
+							size_t jj = current_blockcount_y - 1;
+							curData = *cur_data_pos;
+							pred = last_coeffcients[0] * ii + last_coeffcients[1] * jj + last_coeffcients[2];
+							diff = curData - pred;
+							itvNum = fabs(diff)/realPrecision + 1;
+							if (itvNum < intvCapacity){
+								if (diff < 0) itvNum = -itvNum;
+								type[index] = (int) (itvNum/2) + intvRadius;
+								pred = pred + 2 * (type[index] - intvRadius) * realPrecision;
+								//ganrantee comporession error against the case of machine-epsilon
+								if(fabs(curData - pred)>realPrecision){	
+									type[index] = 0;
+									pred = curData;
+									unpredictable_data[block_unpredictable_count ++] = curData;
+								}		
+							}
+							else{
+								type[index] = 0;
+								pred = curData;
+								unpredictable_data[block_unpredictable_count ++] = curData;
+							}
+
+							// assign value to block surfaces
+							pb_pos[ii * strip_dim0_offset + jj] = pred;
+							index ++;	
+							cur_data_pos ++;
+						}
+						cur_data_pos += dim0_offset - current_blockcount_y;
+					}
+					/*dealing with the last ii (boundary)*/
+					{
+						size_t ii = current_blockcount_x - 1;
+						for(size_t jj=0; jj<current_blockcount_y - 1; jj++){
+							curData = *cur_data_pos;
+							pred = last_coeffcients[0] * ii + last_coeffcients[1] * jj + last_coeffcients[2];
+							diff = curData - pred;
+							itvNum = fabs(diff)/realPrecision + 1;
+							if (itvNum < intvCapacity){
+								if (diff < 0) itvNum = -itvNum;
+								type[index] = (int) (itvNum/2) + intvRadius;
+								pred = pred + 2 * (type[index] - intvRadius) * realPrecision;
+								//ganrantee comporession error against the case of machine-epsilon
+								if(fabs(curData - pred)>realPrecision){	
+									type[index] = 0;
+									pred = curData;
+									unpredictable_data[block_unpredictable_count ++] = curData;
+								}		
+							}
+							else{
+								type[index] = 0;
+								pred = curData;
+								unpredictable_data[block_unpredictable_count ++] = curData;
+							}
+							// assign value to next prediction buffer
+							next_pb_pos[jj] = pred;
+							index ++;	
+							cur_data_pos ++;
+						}
+						/*dealing with the last jj (boundary)*/
+						{
+							// jj == current_blockcount_y - 1
+							size_t jj = current_blockcount_y - 1;
+							curData = *cur_data_pos;
+							pred = last_coeffcients[0] * ii + last_coeffcients[1] * jj + last_coeffcients[2];
+							diff = curData - pred;
+							itvNum = fabs(diff)/realPrecision + 1;
+							if (itvNum < intvCapacity){
+								if (diff < 0) itvNum = -itvNum;
+								type[index] = (int) (itvNum/2) + intvRadius;
+								pred = pred + 2 * (type[index] - intvRadius) * realPrecision;
+								//ganrantee comporession error against the case of machine-epsilon
+								if(fabs(curData - pred)>realPrecision){	
+									type[index] = 0;
+									pred = curData;
+									unpredictable_data[block_unpredictable_count ++] = curData;
+								}		
+							}
+							else{
+								type[index] = 0;
+								pred = curData;
+								unpredictable_data[block_unpredictable_count ++] = curData;
+							}
+
+							// assign value to block surfaces
+							pb_pos[ii * strip_dim0_offset + jj] = pred;
+							// assign value to next prediction buffer
+							next_pb_pos[jj] = pred;
+
+							index ++;	
+							cur_data_pos ++;
+						}
+					} // end ii == -1
+					unpredictable_count = block_unpredictable_count;
+					total_unpred += unpredictable_count;
+					unpredictable_data += unpredictable_count;					
+					reg_count ++;
+				}// end use_reg
+				else{
+					// use SZ
+					// SZ predication
+					unpredictable_count = 0;
+					float * cur_pb_pos = pb_pos;
+					float * cur_data_pos = data_pos;
+					float curData;
+					float pred2D;
+					double itvNum, diff;
+					size_t index = 0;
+					for(size_t ii=0; ii<current_blockcount_x - 1; ii++){
+						for(size_t jj=0; jj<current_blockcount_y; jj++){
+							curData = *cur_data_pos;
+
+							pred2D = cur_pb_pos[-1] + cur_pb_pos[-strip_dim0_offset] - cur_pb_pos[-strip_dim0_offset - 1];
+							diff = curData - pred2D;
+							itvNum = fabs(diff)/realPrecision + 1;
+							if (itvNum < intvCapacity_sz){
+								if (diff < 0) itvNum = -itvNum;
+								type[index] = (int) (itvNum/2) + intvRadius;
+								*cur_pb_pos = pred2D + 2 * (type[index] - intvRadius) * tmp_realPrecision;
+								//ganrantee comporession error against the case of machine-epsilon
+								if(fabs(curData - *cur_pb_pos)>tmp_realPrecision){	
+									type[index] = 0;
+									*cur_pb_pos = curData;	
+									unpredictable_data[unpredictable_count ++] = curData;
+								}					
+							}
+							else{
+								type[index] = 0;
+								*cur_pb_pos = curData;
+								unpredictable_data[unpredictable_count ++] = curData;
+							}
+
+							index ++;
+							cur_pb_pos ++;
+							cur_data_pos ++;
+						}
+						cur_pb_pos += strip_dim0_offset - current_blockcount_y;
+						cur_data_pos += dim0_offset - current_blockcount_y;
+					}
+					/*dealing with the last ii (boundary)*/
+					{
+						// ii == current_blockcount_x - 1
+						for(size_t jj=0; jj<current_blockcount_y; jj++){
+							curData = *cur_data_pos;
+
+							pred2D = cur_pb_pos[-1] + cur_pb_pos[-strip_dim0_offset] - cur_pb_pos[-strip_dim0_offset - 1];
+							diff = curData - pred2D;
+							itvNum = fabs(diff)/realPrecision + 1;
+							if (itvNum < intvCapacity_sz){
+								if (diff < 0) itvNum = -itvNum;
+								type[index] = (int) (itvNum/2) + intvRadius;
+								*cur_pb_pos = pred2D + 2 * (type[index] - intvRadius) * tmp_realPrecision;
+								//ganrantee comporession error against the case of machine-epsilon
+								if(fabs(curData - *cur_pb_pos)>tmp_realPrecision){	
+									type[index] = 0;
+									*cur_pb_pos = curData;	
+									unpredictable_data[unpredictable_count ++] = curData;
+								}					
+							}
+							else{
+								type[index] = 0;
+								*cur_pb_pos = curData;
+								unpredictable_data[unpredictable_count ++] = curData;
+							}
+							next_pb_pos[jj] = *cur_pb_pos;
+							index ++;
+							cur_pb_pos ++;
+							cur_data_pos ++;
+						}
+					}
+					total_unpred += unpredictable_count;
+					unpredictable_data += unpredictable_count;
+					// change indicator
+					indicator_pos[j] = 1;
+				}// end SZ
+				reg_params_pos ++;
+				data_pos += current_blockcount_y;
+				pb_pos += current_blockcount_y;
+				next_pb_pos += current_blockcount_y;
+				type += current_blockcount_x * current_blockcount_y;
+			}// end j
+			indicator_pos += num_y;
+			float * tmp;
+			tmp = cur_pb_buf;
+			cur_pb_buf = next_pb_buf;
+			next_pb_buf = tmp;
+		}// end i		
+	}
+	free(prediction_buffer_1);
+	free(prediction_buffer_2);
+
+	int stateNum = 2*quantization_intervals;
+	HuffmanTree* huffmanTree = createHuffmanTree(stateNum);
+
+	size_t nodeCount = 0;
+	size_t i = 0;
+	init(huffmanTree, result_type, num_elements);
+	for (i = 0; i < stateNum; i++)
+		if (huffmanTree->code[i]) nodeCount++; 
+	nodeCount = nodeCount*2-1;
+
+	unsigned char *treeBytes;
+	unsigned int treeByteSize = convert_HuffTree_to_bytes_anyStates(huffmanTree, nodeCount, &treeBytes);
+
+	unsigned int meta_data_offset = 3 + 1 + MetaDataByteLength;
+	// total size 										metadata		  # elements   real precision		intervals	nodeCount		huffman 	 	block index 						unpredicatable count						mean 					 	unpred size 				elements
+	unsigned char * result = (unsigned char *) calloc(meta_data_offset + exe_params->SZ_SIZE_TYPE + sizeof(double) + sizeof(int) + sizeof(int) + treeByteSize + num_blocks * sizeof(unsigned short) + num_blocks * sizeof(unsigned short) + num_blocks * sizeof(float) + total_unpred * sizeof(float) + num_elements * sizeof(int), 1);
+	unsigned char * result_pos = result;
+	initRandomAccessBytes(result_pos);
+	result_pos += meta_data_offset;
+
+	sizeToBytes(result_pos, num_elements);
+	result_pos += exe_params->SZ_SIZE_TYPE;
+	
+	intToBytes_bigEndian(result_pos, block_size);
+	result_pos += sizeof(int);
+	doubleToBytes(result_pos, realPrecision);
+	result_pos += sizeof(double);
+	intToBytes_bigEndian(result_pos, quantization_intervals);
+	result_pos += sizeof(int);
+	intToBytes_bigEndian(result_pos, treeByteSize);
+	result_pos += sizeof(int);
+	intToBytes_bigEndian(result_pos, nodeCount);
+	result_pos += sizeof(int);
+	memcpy(result_pos, treeBytes, treeByteSize);
+	result_pos += treeByteSize;
+	free(treeBytes);
+
+	memcpy(result_pos, &use_mean, sizeof(unsigned char));
+	result_pos += sizeof(unsigned char);
+	memcpy(result_pos, &mean, sizeof(float));
+	result_pos += sizeof(float);
+
+	size_t indicator_size = convertIntArray2ByteArray_fast_1b_to_result(indicator, num_blocks, result_pos);
+	result_pos += indicator_size;
+	
+	//convert the lead/mid/resi to byte stream 	
+	if(reg_count>0){
+		for(int e=0; e<3; e++){
+			int stateNum = 2*coeff_intvCapacity_sz;
+			HuffmanTree* huffmanTree = createHuffmanTree(stateNum);
+			size_t nodeCount = 0;
+			init(huffmanTree, coeff_type[e], reg_count);
+			size_t i = 0;
+			for (i = 0; i < huffmanTree->stateNum; i++)
+				if (huffmanTree->code[i]) nodeCount++; 
+			nodeCount = nodeCount*2-1;
+			unsigned char *treeBytes;
+			unsigned int treeByteSize = convert_HuffTree_to_bytes_anyStates(huffmanTree, nodeCount, &treeBytes);
+			doubleToBytes(result_pos, precision[e]);
+			result_pos += sizeof(double);
+			intToBytes_bigEndian(result_pos, coeff_intvRadius);
+			result_pos += sizeof(int);
+			intToBytes_bigEndian(result_pos, treeByteSize);
+			result_pos += sizeof(int);
+			intToBytes_bigEndian(result_pos, nodeCount);
+			result_pos += sizeof(int);
+			memcpy(result_pos, treeBytes, treeByteSize);		
+			result_pos += treeByteSize;
+			free(treeBytes);
+			size_t typeArray_size = 0;
+			encode(huffmanTree, coeff_type[e], reg_count, result_pos + sizeof(size_t), &typeArray_size);
+			sizeToBytes(result_pos, typeArray_size);
+			result_pos += sizeof(size_t) + typeArray_size;
+			intToBytes_bigEndian(result_pos, coeff_unpredictable_count[e]);
+			result_pos += sizeof(int);
+			memcpy(result_pos, coeff_unpred_data[e], coeff_unpredictable_count[e]*sizeof(float));
+			result_pos += coeff_unpredictable_count[e]*sizeof(float);
+			SZ_ReleaseHuffman(huffmanTree);
+		}
+	}
+	free(coeff_result_type);
+	free(coeff_unpredictable_data);
+
+	//record the number of unpredictable data and also store them
+	memcpy(result_pos, &total_unpred, sizeof(size_t));
+	result_pos += sizeof(size_t);
+	memcpy(result_pos, result_unpredictable_data, total_unpred * sizeof(float));
+	result_pos += total_unpred * sizeof(float);
+	size_t typeArray_size = 0;
+	encode(huffmanTree, result_type, num_elements, result_pos, &typeArray_size);
+	result_pos += typeArray_size;
+
+	size_t totalEncodeSize = result_pos - result;
+	free(indicator);
+	free(result_unpredictable_data);
+	free(result_type);
+	free(reg_params);
+	
+	SZ_ReleaseHuffman(huffmanTree);
+	*comp_size = totalEncodeSize;
+
+	return result;
+}
+
+/**
+ * Estimate the quantization interval count for a 3D float array by sampling,
+ * and report where the sampled values cluster most densely.
+ *
+ * Two sampling passes are made over oriData (r1 x r2 x r3, r3 contiguous):
+ *   1. a coarse pass with stride (int)sqrt(len) that estimates the mean;
+ *   2. a pass driven by confparams_cpr->sampleDistance that, for every sample,
+ *      evaluates the 7-neighbour 3D prediction and bins the value into a
+ *      histogram of realPrecision-wide bins laid out around that mean.
+ *
+ * @param oriData        input array holding r1*r2*r3 floats (only read).
+ * @param r1             slowest-varying dimension.
+ * @param r2             middle dimension.
+ * @param r3             fastest-varying (contiguous) dimension.
+ * @param realPrecision  absolute error bound; also the histogram bin width.
+ * @param dense_pos      out: value on the boundary between the two adjacent
+ *                       histogram bins that together hold the most samples.
+ * @param max_freq       out: fraction of samples whose prediction error is
+ *                       below realPrecision (0 when nothing was sampled).
+ * @param mean_freq      out: fraction of samples that fall into those two
+ *                       densest bins (0 when nothing was sampled).
+ * @return the interval count rounded up to a power of two, never below 32.
+ *         If the scratch histograms cannot be allocated, 32 is returned with
+ *         *dense_pos set to the sampled mean and both frequencies set to 0.
+ */
+unsigned int optimize_intervals_float_3D_with_freq_and_dense_pos(float *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, float * dense_pos, float * max_freq, float * mean_freq)
+{	
+	const size_t len = r1 * r2 * r3;
+	const size_t r23 = r2 * r3;
+
+	/* Pass 1: coarse mean. The stride is shortened by one element each time
+	 * the running offset crosses a row (r3) or a plane (r2*r3) so that the
+	 * samples do not all land on the same column. */
+	float mean = 0.0;
+	const size_t mean_distance = (int) (sqrt(len));
+	size_t pos = 0;
+	size_t offset_count = 0;
+	size_t offset_count_2 = 0;
+	size_t mean_count = 0;
+	while(pos < len){
+		mean += oriData[pos];
+		mean_count ++;
+		offset_count += mean_distance;
+		offset_count_2 += mean_distance;
+		size_t back_steps = 0;
+		if(offset_count >= r3){
+			offset_count = 0;
+			back_steps ++;
+		}
+		if(offset_count_2 >= r23){
+			offset_count_2 = 0;
+			back_steps ++;
+		}
+		// On tiny inputs the back-steps can cancel the whole stride; stop
+		// instead of spinning in place or stepping before the array.
+		if(back_steps >= mean_distance)
+			break;
+		pos += mean_distance - back_steps;
+	}
+	if(mean_count > 0) mean /= mean_count;
+
+	const size_t range = 8192;
+	const size_t radius = 4096;
+	unsigned int maxRangeRadius = confparams_cpr->maxRangeRadius;
+	float predThreshold = confparams_cpr->predThreshold;
+	// A sample distance below 1 would make the modulo below divide by zero.
+	const size_t sampleDistance = (confparams_cpr->sampleDistance > 0) ? (size_t) confparams_cpr->sampleDistance : 1;
+
+	// freq_intervals: histogram of (value - mean); intervals: histogram of prediction error.
+	size_t * freq_intervals = (size_t *) calloc(range, sizeof(size_t));
+	size_t * intervals = (size_t *) calloc(maxRangeRadius, sizeof(size_t));
+	if(freq_intervals == NULL || intervals == NULL){
+		free(freq_intervals);
+		free(intervals);
+		*dense_pos = mean;
+		*max_freq = 0;
+		*mean_freq = 0;
+		return 32;
+	}
+
+	/* Pass 2: start at element (1, 1, column). The predictor reads the
+	 * neighbour at column - 1, so the starting column must be at least 1. */
+	const ptrdiff_t row = (ptrdiff_t) r3;
+	const ptrdiff_t plane = (ptrdiff_t) r23;
+	size_t freq_count = 0;
+	size_t sample_count = 0;
+	size_t n1_count = 1, n2_count = 1; // count i,j sum
+	offset_count = (sampleDistance > 2) ? sampleDistance - 2 : 1; // count r3 offset
+	pos = r23 + r3 + offset_count;
+
+	while(pos < len){
+		const float * cur = oriData + pos;
+
+		float pred_value = cur[-1] + cur[-row] + cur[-plane] - cur[-1-plane] - cur[-row-1] - cur[-row-plane] + cur[-row-plane-1];
+		float pred_err = fabs(pred_value - *cur);
+		if(pred_err < realPrecision) freq_count ++;
+		size_t radiusIndex = (pred_err/realPrecision+1)/2;
+		if(radiusIndex>=maxRangeRadius)
+		{
+			radiusIndex = maxRangeRadius - 1;
+		}
+		intervals[radiusIndex]++;
+
+		// Bin the distance from the mean; out-of-range values go to the end bins.
+		float mean_diff = *cur - mean;
+		ptrdiff_t freq_index;
+		if(mean_diff > 0) freq_index = (ptrdiff_t)(mean_diff/realPrecision) + (ptrdiff_t) radius;
+		else freq_index = (ptrdiff_t)(mean_diff/realPrecision) - 1 + (ptrdiff_t) radius;
+		if(freq_index <= 0){
+			freq_intervals[0] ++;
+		}
+		else if((size_t) freq_index >= range){
+			freq_intervals[range - 1] ++;
+		}
+		else{
+			freq_intervals[freq_index] ++;
+		}
+
+		// Advance along the row; past its end, move to the next row (skipping
+		// row 0 of a new plane) and pick a column that depends on (i + j).
+		offset_count += sampleDistance;
+		if(offset_count >= r3){
+			n2_count ++;
+			if(n2_count == r2){
+				n1_count ++;
+				n2_count = 1;
+				pos += r3;
+			}
+			offset_count_2 = (n1_count + n2_count) % sampleDistance;
+			pos += (r3 + sampleDistance - offset_count) + (sampleDistance - offset_count_2);
+			offset_count = (sampleDistance - offset_count_2);
+			if(offset_count == 0) offset_count ++;
+		}
+		else pos += sampleDistance;
+		sample_count ++;
+	}	
+	// Report 0 rather than 0/0 when the array was too small to sample.
+	*max_freq = (sample_count > 0) ? freq_count * 1.0/ sample_count : 0;
+
+	//compute the appropriate number
+	size_t targetCount = sample_count*predThreshold;
+	size_t sum = 0;
+	size_t i;
+	for(i=0;i<maxRangeRadius;i++)
+	{
+		sum += intervals[i];
+		if(sum>targetCount)
+			break;
+	}
+	if(i>=maxRangeRadius)
+		i = maxRangeRadius-1;
+	unsigned int accIntervals = 2*(i+1);
+	unsigned int powerOf2 = roundUpToPowerOf2(accIntervals);
+
+	if(powerOf2<32)
+		powerOf2 = 32;
+
+	// Find the pair of adjacent bins with the largest combined count; the
+	// two end bins, which collect the clamped values, are never selected.
+	size_t max_sum = 0;
+	size_t max_index = 0;
+	for(size_t bin=1; bin<range-2; bin++){
+		size_t tmp_sum = freq_intervals[bin] + freq_intervals[bin + 1];
+		if(tmp_sum > max_sum){
+			max_sum = tmp_sum;
+			max_index = bin;
+		}
+	}
+	*dense_pos = mean + realPrecision * (ptrdiff_t)(max_index + 1 - radius);
+	*mean_freq = (sample_count > 0) ? max_sum * 1.0 / sample_count : 0;
+
+	free(freq_intervals);
+	free(intervals);
+	return powerOf2;
+}
+
+
+// 3D:  modified for higher performance
+unsigned char * SZ_compress_float_3D_MDQ_nonblocked_with_blocked_regression(float *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, size_t * comp_size){
+
+#ifdef HAVE_TIMECMPR	
+	float* decData = NULL;
+	if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+		decData = (float*)(multisteps->hist_data);
+#endif
+
+	unsigned int quantization_intervals;
+	float sz_sample_correct_freq = -1;//0.5; //-1
+	float dense_pos;
+	float mean_flush_freq;
+	unsigned char use_mean = 0;
+
+	// calculate block dims
+	size_t num_x, num_y, num_z;
+	size_t block_size = 6;
+	SZ_COMPUTE_3D_NUMBER_OF_BLOCKS(r1, num_x, block_size);
+	SZ_COMPUTE_3D_NUMBER_OF_BLOCKS(r2, num_y, block_size);
+	SZ_COMPUTE_3D_NUMBER_OF_BLOCKS(r3, num_z, block_size);
+
+	size_t split_index_x, split_index_y, split_index_z;
+	size_t early_blockcount_x, early_blockcount_y, early_blockcount_z;
+	size_t late_blockcount_x, late_blockcount_y, late_blockcount_z;
+	SZ_COMPUTE_BLOCKCOUNT(r1, num_x, split_index_x, early_blockcount_x, late_blockcount_x);
+	SZ_COMPUTE_BLOCKCOUNT(r2, num_y, split_index_y, early_blockcount_y, late_blockcount_y);
+	SZ_COMPUTE_BLOCKCOUNT(r3, num_z, split_index_z, early_blockcount_z, late_blockcount_z);
+
+	size_t max_num_block_elements = early_blockcount_x * early_blockcount_y * early_blockcount_z;
+	size_t num_blocks = num_x * num_y * num_z;
+	size_t num_elements = r1 * r2 * r3;
+
+	size_t dim0_offset = r2 * r3;
+	size_t dim1_offset = r3;	
+
+	int * result_type = (int *) malloc(num_elements * sizeof(int));
+	size_t unpred_data_max_size = max_num_block_elements;
+	float * result_unpredictable_data = (float *) malloc(unpred_data_max_size * sizeof(float) * num_blocks);
+	size_t total_unpred = 0;
+	size_t unpredictable_count;
+	size_t max_unpred_count = 0;
+	float * data_pos = oriData;
+	int * type = result_type;
+	size_t type_offset;
+	size_t offset_x, offset_y, offset_z;
+	size_t current_blockcount_x, current_blockcount_y, current_blockcount_z;
+
+	float * reg_params = (float *) malloc(num_blocks * 4 * sizeof(float));
+	float * reg_params_pos = reg_params;
+	// move regression part out
+	size_t params_offset_b = num_blocks;
+	size_t params_offset_c = 2*num_blocks;
+	size_t params_offset_d = 3*num_blocks;
+	for(size_t i=0; i<num_x; i++){
+		for(size_t j=0; j<num_y; j++){
+			for(size_t k=0; k<num_z; k++){
+				current_blockcount_x = (i < split_index_x) ? early_blockcount_x : late_blockcount_x;
+				current_blockcount_y = (j < split_index_y) ? early_blockcount_y : late_blockcount_y;
+				current_blockcount_z = (k < split_index_z) ? early_blockcount_z : late_blockcount_z;
+				offset_x = (i < split_index_x) ? i * early_blockcount_x : i * late_blockcount_x + split_index_x;
+				offset_y = (j < split_index_y) ? j * early_blockcount_y : j * late_blockcount_y + split_index_y;
+				offset_z = (k < split_index_z) ? k * early_blockcount_z : k * late_blockcount_z + split_index_z;
+	
+				data_pos = oriData + offset_x * dim0_offset + offset_y * dim1_offset + offset_z;
+				/*Calculate regression coefficients*/
+				{
+					float * cur_data_pos = data_pos;
+					float fx = 0.0;
+					float fy = 0.0;
+					float fz = 0.0;
+					float f = 0;
+					float sum_x, sum_y; 
+					float curData;
+					for(size_t i=0; i<current_blockcount_x; i++){
+						sum_x = 0;
+						for(size_t j=0; j<current_blockcount_y; j++){
+							sum_y = 0;
+							for(size_t k=0; k<current_blockcount_z; k++){
+								curData = *cur_data_pos;
+								// f += curData;
+								// fx += curData * i;
+								// fy += curData * j;
+								// fz += curData * k;
+								sum_y += curData;
+								fz += curData * k;
+								cur_data_pos ++;
+							}
+							fy += sum_y * j;
+							sum_x += sum_y;
+							cur_data_pos += dim1_offset - current_blockcount_z;
+						}
+						fx += sum_x * i;
+						f += sum_x;
+						cur_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+					}
+					float coeff = 1.0 / (current_blockcount_x * current_blockcount_y * current_blockcount_z);
+					reg_params_pos[0] = (2 * fx / (current_blockcount_x - 1) - f) * 6 * coeff / (current_blockcount_x + 1);
+					reg_params_pos[params_offset_b] = (2 * fy / (current_blockcount_y - 1) - f) * 6 * coeff / (current_blockcount_y + 1);
+					reg_params_pos[params_offset_c] = (2 * fz / (current_blockcount_z - 1) - f) * 6 * coeff / (current_blockcount_z + 1);
+					reg_params_pos[params_offset_d] = f * coeff - ((current_blockcount_x - 1) * reg_params_pos[0] / 2 + (current_blockcount_y - 1) * reg_params_pos[params_offset_b] / 2 + (current_blockcount_z - 1) * reg_params_pos[params_offset_c] / 2);
+				}
+				reg_params_pos ++;
+			}
+		}
+	}
+	
+	//Compress coefficient arrays
+	double precision_a, precision_b, precision_c, precision_d;
+	float rel_param_err = 0.025;
+	precision_a = rel_param_err * realPrecision / late_blockcount_x;
+	precision_b = rel_param_err * realPrecision / late_blockcount_y;
+	precision_c = rel_param_err * realPrecision / late_blockcount_z;
+	precision_d = rel_param_err * realPrecision;
+
+	if(exe_params->optQuantMode==1)
+	{
+		quantization_intervals = optimize_intervals_float_3D_with_freq_and_dense_pos(oriData, r1, r2, r3, realPrecision, &dense_pos, &sz_sample_correct_freq, &mean_flush_freq);
+		if(mean_flush_freq > 0.5 || mean_flush_freq > sz_sample_correct_freq) use_mean = 1;
+		updateQuantizationInfo(quantization_intervals);
+	}	
+	else{
+		quantization_intervals = exe_params->intvCapacity;
+	}
+
+	float mean = 0;
+	if(use_mean){
+		// compute mean
+		double sum = 0.0;
+		size_t mean_count = 0;
+		for(size_t i=0; i<num_elements; i++){
+			if(fabs(oriData[i] - dense_pos) < realPrecision){
+				sum += oriData[i];
+				mean_count ++;
+			}
+		}
+		if(mean_count > 0) mean = sum / mean_count;
+	}
+
+	double tmp_realPrecision = realPrecision;
+
+	// use two prediction buffers for higher performance
+	float * unpredictable_data = result_unpredictable_data;
+	unsigned char * indicator = (unsigned char *) malloc(num_blocks * sizeof(unsigned char));
+	memset(indicator, 0, num_blocks * sizeof(unsigned char));
+	size_t reg_count = 0;
+	size_t strip_dim_0 = early_blockcount_x + 1;
+	size_t strip_dim_1 = r2 + 1;
+	size_t strip_dim_2 = r3 + 1;
+	size_t strip_dim0_offset = strip_dim_1 * strip_dim_2;
+	size_t strip_dim1_offset = strip_dim_2;
+	unsigned char * indicator_pos = indicator;
+
+	size_t prediction_buffer_size = strip_dim_0 * strip_dim0_offset * sizeof(float);
+	float * prediction_buffer_1 = (float *) malloc(prediction_buffer_size);
+	memset(prediction_buffer_1, 0, prediction_buffer_size);
+	float * prediction_buffer_2 = (float *) malloc(prediction_buffer_size);
+	memset(prediction_buffer_2, 0, prediction_buffer_size);
+	float * cur_pb_buf = prediction_buffer_1;
+	float * next_pb_buf = prediction_buffer_2;
+	float * cur_pb_buf_pos;
+	float * next_pb_buf_pos;
+	int intvCapacity = exe_params->intvCapacity;
+	int intvRadius = exe_params->intvRadius;	
+	int use_reg = 0;
+	float noise = realPrecision * 1.22;
+
+	reg_params_pos = reg_params;
+	// compress the regression coefficients on the fly
+	float last_coeffcients[4] = {0.0};
+	int coeff_intvCapacity_sz = 65536;
+	int coeff_intvRadius = coeff_intvCapacity_sz / 2;
+	int * coeff_type[4];
+	int * coeff_result_type = (int *) malloc(num_blocks*4*sizeof(int));
+	float * coeff_unpred_data[4];
+	float * coeff_unpredictable_data = (float *) malloc(num_blocks*4*sizeof(float));
+	double precision[4];
+	precision[0] = precision_a, precision[1] = precision_b, precision[2] = precision_c, precision[3] = precision_d;
+	for(int i=0; i<4; i++){
+		coeff_type[i] = coeff_result_type + i * num_blocks;
+		coeff_unpred_data[i] = coeff_unpredictable_data + i * num_blocks;
+	}
+	int coeff_index = 0;
+	unsigned int coeff_unpredictable_count[4] = {0};
+
+	if(use_mean){
+		int intvCapacity_sz = intvCapacity - 2;
+		for(size_t i=0; i<num_x; i++){
+			current_blockcount_x = (i < split_index_x) ? early_blockcount_x : late_blockcount_x;
+			offset_x = (i < split_index_x) ? i * early_blockcount_x : i * late_blockcount_x + split_index_x;
+			for(size_t j=0; j<num_y; j++){
+				offset_y = (j < split_index_y) ? j * early_blockcount_y : j * late_blockcount_y + split_index_y;
+				current_blockcount_y = (j < split_index_y) ? early_blockcount_y : late_blockcount_y;
+				data_pos = oriData + offset_x * dim0_offset + offset_y * dim1_offset;
+				type_offset = offset_x * dim0_offset +  offset_y * current_blockcount_x * dim1_offset;
+				type = result_type + type_offset;
+
+				// prediction buffer is (current_block_count_x + 1) * (current_block_count_y + 1) * (current_block_count_z + 1)
+				cur_pb_buf_pos = cur_pb_buf + offset_y * strip_dim1_offset + strip_dim0_offset + strip_dim1_offset + 1;
+				next_pb_buf_pos = next_pb_buf + offset_y * strip_dim1_offset + strip_dim1_offset + 1;
+
+				size_t current_blockcount_z;
+				float * pb_pos = cur_pb_buf_pos;
+				float * next_pb_pos = next_pb_buf_pos;
+				size_t strip_unpredictable_count = 0;
+				for(size_t k=0; k<num_z; k++){
+					current_blockcount_z = (k < split_index_z) ? early_blockcount_z : late_blockcount_z;
+#ifdef HAVE_TIMECMPR
+					size_t offset_z = 0;
+					offset_z = (k < split_index_z) ? k * early_blockcount_z : k * late_blockcount_z + split_index_z;
+					size_t block_offset = offset_x * dim0_offset + offset_y * dim1_offset + offset_z;
+#endif
+					/*sampling and decide which predictor*/
+					{
+						// sample point [1, 1, 1] [1, 1, 4] [1, 4, 1] [1, 4, 4] [4, 1, 1] [4, 1, 4] [4, 4, 1] [4, 4, 4]
+						float * cur_data_pos;
+						float curData;
+						float pred_reg, pred_sz;
+						float err_sz = 0.0, err_reg = 0.0;
+						int bmi = 0;
+						if(i>0 && j>0 && k>0){
+							for(int i=0; i<block_size; i++){
+								cur_data_pos = data_pos + i*dim0_offset + i*dim1_offset + i;
+								curData = *cur_data_pos;
+								pred_sz = cur_data_pos[-1] + cur_data_pos[-dim1_offset]+ cur_data_pos[-dim0_offset] - cur_data_pos[-dim1_offset - 1] - cur_data_pos[-dim0_offset - 1] - cur_data_pos[-dim0_offset - dim1_offset] + cur_data_pos[-dim0_offset - dim1_offset - 1];
+								pred_reg = reg_params_pos[0] * i + reg_params_pos[params_offset_b] * i + reg_params_pos[params_offset_c] * i + reg_params_pos[params_offset_d];							
+								err_sz += MIN(fabs(pred_sz - curData) + noise, fabs(mean - curData));
+								err_reg += fabs(pred_reg - curData);
+
+								bmi = block_size - i;
+								cur_data_pos = data_pos + i*dim0_offset + i*dim1_offset + bmi;
+								curData = *cur_data_pos;
+								pred_sz = cur_data_pos[-1] + cur_data_pos[-dim1_offset]+ cur_data_pos[-dim0_offset] - cur_data_pos[-dim1_offset - 1] - cur_data_pos[-dim0_offset - 1] - cur_data_pos[-dim0_offset - dim1_offset] + cur_data_pos[-dim0_offset - dim1_offset - 1];
+								pred_reg = reg_params_pos[0] * i + reg_params_pos[params_offset_b] * i + reg_params_pos[params_offset_c] * bmi + reg_params_pos[params_offset_d];							
+								err_sz += MIN(fabs(pred_sz - curData) + noise, fabs(mean - curData));
+								err_reg += fabs(pred_reg - curData);								
+
+								cur_data_pos = data_pos + i*dim0_offset + bmi*dim1_offset + i;
+								curData = *cur_data_pos;
+								pred_sz = cur_data_pos[-1] + cur_data_pos[-dim1_offset]+ cur_data_pos[-dim0_offset] - cur_data_pos[-dim1_offset - 1] - cur_data_pos[-dim0_offset - 1] - cur_data_pos[-dim0_offset - dim1_offset] + cur_data_pos[-dim0_offset - dim1_offset - 1];
+								pred_reg = reg_params_pos[0] * i + reg_params_pos[params_offset_b] * bmi + reg_params_pos[params_offset_c] * i + reg_params_pos[params_offset_d];							
+								err_sz += MIN(fabs(pred_sz - curData) + noise, fabs(mean - curData));
+								err_reg += fabs(pred_reg - curData);								
+
+								cur_data_pos = data_pos + i*dim0_offset + bmi*dim1_offset + bmi;
+								curData = *cur_data_pos;
+								pred_sz = cur_data_pos[-1] + cur_data_pos[-dim1_offset]+ cur_data_pos[-dim0_offset] - cur_data_pos[-dim1_offset - 1] - cur_data_pos[-dim0_offset - 1] - cur_data_pos[-dim0_offset - dim1_offset] + cur_data_pos[-dim0_offset - dim1_offset - 1];
+								pred_reg = reg_params_pos[0] * i + reg_params_pos[params_offset_b] * bmi + reg_params_pos[params_offset_c] * bmi + reg_params_pos[params_offset_d];							
+								err_sz += MIN(fabs(pred_sz - curData) + noise, fabs(mean - curData));
+								err_reg += fabs(pred_reg - curData);
+							}
+						}
+						else{
+							for(int i=1; i<block_size; i++){
+								cur_data_pos = data_pos + i*dim0_offset + i*dim1_offset + i;
+								curData = *cur_data_pos;
+								pred_sz = cur_data_pos[-1] + cur_data_pos[-dim1_offset]+ cur_data_pos[-dim0_offset] - cur_data_pos[-dim1_offset - 1] - cur_data_pos[-dim0_offset - 1] - cur_data_pos[-dim0_offset - dim1_offset] + cur_data_pos[-dim0_offset - dim1_offset - 1];
+								pred_reg = reg_params_pos[0] * i + reg_params_pos[params_offset_b] * i + reg_params_pos[params_offset_c] * i + reg_params_pos[params_offset_d];							
+								err_sz += MIN(fabs(pred_sz - curData) + noise, fabs(mean - curData));
+								err_reg += fabs(pred_reg - curData);
+
+								bmi = block_size - i;
+								cur_data_pos = data_pos + i*dim0_offset + i*dim1_offset + bmi;
+								curData = *cur_data_pos;
+								pred_sz = cur_data_pos[-1] + cur_data_pos[-dim1_offset]+ cur_data_pos[-dim0_offset] - cur_data_pos[-dim1_offset - 1] - cur_data_pos[-dim0_offset - 1] - cur_data_pos[-dim0_offset - dim1_offset] + cur_data_pos[-dim0_offset - dim1_offset - 1];
+								pred_reg = reg_params_pos[0] * i + reg_params_pos[params_offset_b] * i + reg_params_pos[params_offset_c] * bmi + reg_params_pos[params_offset_d];							
+								err_sz += MIN(fabs(pred_sz - curData) + noise, fabs(mean - curData));
+								err_reg += fabs(pred_reg - curData);								
+
+								cur_data_pos = data_pos + i*dim0_offset + bmi*dim1_offset + i;
+								curData = *cur_data_pos;
+								pred_sz = cur_data_pos[-1] + cur_data_pos[-dim1_offset]+ cur_data_pos[-dim0_offset] - cur_data_pos[-dim1_offset - 1] - cur_data_pos[-dim0_offset - 1] - cur_data_pos[-dim0_offset - dim1_offset] + cur_data_pos[-dim0_offset - dim1_offset - 1];
+								pred_reg = reg_params_pos[0] * i + reg_params_pos[params_offset_b] * bmi + reg_params_pos[params_offset_c] * i + reg_params_pos[params_offset_d];							
+								err_sz += MIN(fabs(pred_sz - curData) + noise, fabs(mean - curData));
+								err_reg += fabs(pred_reg - curData);								
+
+								cur_data_pos = data_pos + i*dim0_offset + bmi*dim1_offset + bmi;
+								curData = *cur_data_pos;
+								pred_sz = cur_data_pos[-1] + cur_data_pos[-dim1_offset]+ cur_data_pos[-dim0_offset] - cur_data_pos[-dim1_offset - 1] - cur_data_pos[-dim0_offset - 1] - cur_data_pos[-dim0_offset - dim1_offset] + cur_data_pos[-dim0_offset - dim1_offset - 1];
+								pred_reg = reg_params_pos[0] * i + reg_params_pos[params_offset_b] * bmi + reg_params_pos[params_offset_c] * bmi + reg_params_pos[params_offset_d];							
+								err_sz += MIN(fabs(pred_sz - curData) + noise, fabs(mean - curData));
+								err_reg += fabs(pred_reg - curData);								
+
+							}
+						}
+						use_reg = (err_reg < err_sz);
+					}
+					if(use_reg){
+						{
+							/*predict coefficients in current block via previous reg_block*/
+							float cur_coeff;
+							double diff, itvNum;
+							for(int e=0; e<4; e++){
+								cur_coeff = reg_params_pos[e*num_blocks];
+								diff = cur_coeff - last_coeffcients[e];
+								itvNum = fabs(diff)/precision[e] + 1;
+								if (itvNum < coeff_intvCapacity_sz){
+									if (diff < 0) itvNum = -itvNum;
+									coeff_type[e][coeff_index] = (int) (itvNum/2) + coeff_intvRadius;
+									last_coeffcients[e] = last_coeffcients[e] + 2 * (coeff_type[e][coeff_index] - coeff_intvRadius) * precision[e];
+									//guarantee the compression error bound against the case of machine-epsilon
+									if(fabs(cur_coeff - last_coeffcients[e])>precision[e]){	
+										coeff_type[e][coeff_index] = 0;
+										last_coeffcients[e] = cur_coeff;	
+										coeff_unpred_data[e][coeff_unpredictable_count[e] ++] = cur_coeff;
+									}					
+								}
+								else{
+									coeff_type[e][coeff_index] = 0;
+									last_coeffcients[e] = cur_coeff;
+									coeff_unpred_data[e][coeff_unpredictable_count[e] ++] = cur_coeff;
+								}
+							}
+							coeff_index ++;
+						}
+						float curData;
+						float pred;
+						double itvNum;
+						double diff;
+						size_t index = 0;
+						size_t block_unpredictable_count = 0;
+						float * cur_data_pos = data_pos;
+						for(size_t ii=0; ii<current_blockcount_x - 1; ii++){
+							for(size_t jj=0; jj<current_blockcount_y; jj++){
+								for(size_t kk=0; kk<current_blockcount_z; kk++){
+									curData = *cur_data_pos;
+									pred = last_coeffcients[0] * ii + last_coeffcients[1] * jj + last_coeffcients[2] * kk + last_coeffcients[3];									
+									diff = curData - pred;
+									itvNum = fabs(diff)/tmp_realPrecision + 1;
+									if (itvNum < intvCapacity){
+										if (diff < 0) itvNum = -itvNum;
+										type[index] = (int) (itvNum/2) + intvRadius;
+										pred = pred + 2 * (type[index] - intvRadius) * tmp_realPrecision;
+										//guarantee the compression error bound against the case of machine-epsilon
+										if(fabs(curData - pred)>tmp_realPrecision){	
+											type[index] = 0;
+											pred = curData;
+											unpredictable_data[block_unpredictable_count ++] = curData;
+										}		
+									}
+									else{
+										type[index] = 0;
+										pred = curData;
+										unpredictable_data[block_unpredictable_count ++] = curData;
+									}
+									
+#ifdef HAVE_TIMECMPR
+									size_t point_offset = ii*dim0_offset + jj*dim1_offset + kk;
+									if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+										decData[block_offset + point_offset] = pred;
+#endif									
+									
+									if((jj == current_blockcount_y - 1) || (kk == current_blockcount_z - 1)){
+										// assign value to block surfaces
+										pb_pos[ii * strip_dim0_offset + jj * strip_dim1_offset + kk] = pred;
+									}
+									index ++;	
+									cur_data_pos ++;
+								}
+								cur_data_pos += dim1_offset - current_blockcount_z;
+							}
+							cur_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+						}
+						/*dealing with the last ii (boundary)*/
+						{
+							// ii == current_blockcount_x - 1
+							size_t ii = current_blockcount_x - 1;
+							for(size_t jj=0; jj<current_blockcount_y; jj++){
+								for(size_t kk=0; kk<current_blockcount_z; kk++){
+									curData = *cur_data_pos;
+									pred = last_coeffcients[0] * ii + last_coeffcients[1] * jj + last_coeffcients[2] * kk + last_coeffcients[3];									
+									diff = curData - pred;
+									itvNum = fabs(diff)/tmp_realPrecision + 1;
+									if (itvNum < intvCapacity){
+										if (diff < 0) itvNum = -itvNum;
+										type[index] = (int) (itvNum/2) + intvRadius;
+										pred = pred + 2 * (type[index] - intvRadius) * tmp_realPrecision;
+										//guarantee the compression error bound against the case of machine-epsilon
+										if(fabs(curData - pred)>tmp_realPrecision){	
+											type[index] = 0;
+											pred = curData;
+											unpredictable_data[block_unpredictable_count ++] = curData;
+										}		
+									}
+									else{
+										type[index] = 0;
+										pred = curData;
+										unpredictable_data[block_unpredictable_count ++] = curData;
+									}
+
+#ifdef HAVE_TIMECMPR
+									size_t point_offset = ii*dim0_offset + jj*dim1_offset + kk;
+									if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+										decData[block_offset + point_offset] = pred;
+#endif									
+
+									if((jj == current_blockcount_y - 1) || (kk == current_blockcount_z - 1)){
+										// assign value to block surfaces
+										pb_pos[ii * strip_dim0_offset + jj * strip_dim1_offset + kk] = pred;
+									}
+									// assign value to next prediction buffer
+									next_pb_pos[jj * strip_dim1_offset + kk] = pred;
+									index ++;
+									cur_data_pos ++;
+								}
+								cur_data_pos += dim1_offset - current_blockcount_z;
+							}
+						}
+						unpredictable_count = block_unpredictable_count;
+						strip_unpredictable_count += unpredictable_count;
+						unpredictable_data += unpredictable_count;
+						
+						reg_count ++;
+					}
+					else{
+						// use SZ
+						// SZ prediction
+						unpredictable_count = 0;
+						float * cur_pb_pos = pb_pos;
+						float * cur_data_pos = data_pos;
+						float curData;
+						float pred3D;
+						double itvNum, diff;
+						size_t index = 0;
+						for(size_t ii=0; ii<current_blockcount_x - 1; ii++){
+							for(size_t jj=0; jj<current_blockcount_y; jj++){
+								for(size_t kk=0; kk<current_blockcount_z; kk++){
+
+									curData = *cur_data_pos;
+									if(fabs(curData - mean) <= realPrecision){
+										// adjust type[index] to intvRadius for coherence with freq in reg
+										type[index] = intvRadius;
+										*cur_pb_pos = mean;
+									}
+									else
+									{
+										pred3D = cur_pb_pos[-1] + cur_pb_pos[-strip_dim1_offset]+ cur_pb_pos[-strip_dim0_offset] - cur_pb_pos[-strip_dim1_offset - 1]
+												 - cur_pb_pos[-strip_dim0_offset - 1] - cur_pb_pos[-strip_dim0_offset - strip_dim1_offset] + cur_pb_pos[-strip_dim0_offset - strip_dim1_offset - 1];
+										diff = curData - pred3D;
+										itvNum = fabs(diff)/realPrecision + 1;
+										if (itvNum < intvCapacity_sz){
+											if (diff < 0) itvNum = -itvNum;
+											type[index] = (int) (itvNum/2) + intvRadius;
+											*cur_pb_pos = pred3D + 2 * (type[index] - intvRadius) * tmp_realPrecision;
+											if(type[index] <= intvRadius) type[index] -= 1;
+											//guarantee the compression error bound against the case of machine-epsilon
+											if(fabs(curData - *cur_pb_pos)>tmp_realPrecision){	
+												type[index] = 0;
+												*cur_pb_pos = curData;	
+												unpredictable_data[unpredictable_count ++] = curData;
+											}					
+										}
+										else{
+											type[index] = 0;
+											*cur_pb_pos = curData;
+											unpredictable_data[unpredictable_count ++] = curData;
+										}
+									}
+#ifdef HAVE_TIMECMPR
+									size_t point_offset = ii*dim0_offset + jj*dim1_offset + kk;
+									if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+										decData[block_offset + point_offset] = *cur_pb_pos;
+#endif																		
+									
+									index ++;
+									cur_pb_pos ++;
+									cur_data_pos ++;
+								}
+								cur_pb_pos += strip_dim1_offset - current_blockcount_z;
+								cur_data_pos += dim1_offset - current_blockcount_z;
+							}
+							cur_pb_pos += strip_dim0_offset - current_blockcount_y * strip_dim1_offset;
+							cur_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+						}
+						/*dealing with the last ii (boundary)*/
+						{
+							// ii == current_blockcount_x - 1
+							for(size_t jj=0; jj<current_blockcount_y; jj++){
+								for(size_t kk=0; kk<current_blockcount_z; kk++){
+
+									curData = *cur_data_pos;
+									if(fabs(curData - mean) <= realPrecision){
+										// adjust type[index] to intvRadius for coherence with freq in reg
+										type[index] = intvRadius;
+										*cur_pb_pos = mean;
+									}
+									else
+									{
+										pred3D = cur_pb_pos[-1] + cur_pb_pos[-strip_dim1_offset]+ cur_pb_pos[-strip_dim0_offset] - cur_pb_pos[-strip_dim1_offset - 1]
+												 - cur_pb_pos[-strip_dim0_offset - 1] - cur_pb_pos[-strip_dim0_offset - strip_dim1_offset] + cur_pb_pos[-strip_dim0_offset - strip_dim1_offset - 1];
+										diff = curData - pred3D;
+										itvNum = fabs(diff)/realPrecision + 1;
+										if (itvNum < intvCapacity_sz){
+											if (diff < 0) itvNum = -itvNum;
+											type[index] = (int) (itvNum/2) + intvRadius;
+											*cur_pb_pos = pred3D + 2 * (type[index] - intvRadius) * tmp_realPrecision;
+											if(type[index] <= intvRadius) type[index] -= 1;
+											//guarantee the compression error bound against the case of machine-epsilon
+											if(fabs(curData - *cur_pb_pos)>tmp_realPrecision){	
+												type[index] = 0;
+												*cur_pb_pos = curData;	
+												unpredictable_data[unpredictable_count ++] = curData;
+											}					
+										}
+										else{
+											type[index] = 0;
+											*cur_pb_pos = curData;
+											unpredictable_data[unpredictable_count ++] = curData;
+										}
+									}
+#ifdef HAVE_TIMECMPR
+									size_t ii = current_blockcount_x - 1;
+									size_t point_offset = ii*dim0_offset + jj*dim1_offset + kk;
+									if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+										decData[block_offset + point_offset] = *cur_pb_pos;
+#endif																		
+									
+									next_pb_pos[jj * strip_dim1_offset + kk] = *cur_pb_pos;
+									index ++;
+									cur_pb_pos ++;
+									cur_data_pos ++;
+								}
+								cur_pb_pos += strip_dim1_offset - current_blockcount_z;
+								cur_data_pos += dim1_offset - current_blockcount_z;
+							}
+						}
+						strip_unpredictable_count += unpredictable_count;
+						unpredictable_data += unpredictable_count;
+						// change indicator
+						indicator_pos[k] = 1;
+					}// end SZ
+					
+					reg_params_pos ++;
+					data_pos += current_blockcount_z;
+					pb_pos += current_blockcount_z;
+					next_pb_pos += current_blockcount_z;
+					type += current_blockcount_x * current_blockcount_y * current_blockcount_z;
+
+				} // end k
+
+				if(strip_unpredictable_count > max_unpred_count){
+					max_unpred_count = strip_unpredictable_count;
+				}
+				total_unpred += strip_unpredictable_count;
+				indicator_pos += num_z;
+			}// end j
+			float * tmp;
+			tmp = cur_pb_buf;
+			cur_pb_buf = next_pb_buf;
+			next_pb_buf = tmp;
+		}// end i
+	}
+	else{
+		int intvCapacity_sz = intvCapacity - 2;
+		for(size_t i=0; i<num_x; i++){
+			current_blockcount_x = (i < split_index_x) ? early_blockcount_x : late_blockcount_x;
+			offset_x = (i < split_index_x) ? i * early_blockcount_x : i * late_blockcount_x + split_index_x;
+
+			for(size_t j=0; j<num_y; j++){
+				offset_y = (j < split_index_y) ? j * early_blockcount_y : j * late_blockcount_y + split_index_y;
+				current_blockcount_y = (j < split_index_y) ? early_blockcount_y : late_blockcount_y;
+				data_pos = oriData + offset_x * dim0_offset + offset_y * dim1_offset;
+				// copy bottom plane from plane buffer
+				// memcpy(prediction_buffer, bottom_buffer + offset_y * strip_dim1_offset, (current_blockcount_y + 1) * strip_dim1_offset * sizeof(float));
+				type_offset = offset_x * dim0_offset +  offset_y * current_blockcount_x * dim1_offset;
+				type = result_type + type_offset;
+
+				// prediction buffer is (current_block_count_x + 1) * (current_block_count_y + 1) * (current_block_count_z + 1)
+				cur_pb_buf_pos = cur_pb_buf + offset_y * strip_dim1_offset + strip_dim0_offset + strip_dim1_offset + 1;
+				next_pb_buf_pos = next_pb_buf + offset_y * strip_dim1_offset + strip_dim1_offset + 1;
+
+				size_t current_blockcount_z;
+				float * pb_pos = cur_pb_buf_pos;
+				float * next_pb_pos = next_pb_buf_pos;
+				size_t strip_unpredictable_count = 0;
+				for(size_t k=0; k<num_z; k++){
+					current_blockcount_z = (k < split_index_z) ? early_blockcount_z : late_blockcount_z;
+#ifdef HAVE_TIMECMPR
+				size_t offset_z = 0;
+				offset_z = (k < split_index_z) ? k * early_blockcount_z : k * late_blockcount_z + split_index_z;
+				size_t block_offset = offset_x * dim0_offset + offset_y * dim1_offset + offset_z;
+#endif														
+					/*sampling*/
+					{
+						// sample point [1, 1, 1] [1, 1, 4] [1, 4, 1] [1, 4, 4] [4, 1, 1] [4, 1, 4] [4, 4, 1] [4, 4, 4]
+						float * cur_data_pos;
+						float curData;
+						float pred_reg, pred_sz;
+						float err_sz = 0.0, err_reg = 0.0;
+						int bmi;
+						if(i>0 && j>0 && k>0){
+							for(int i=0; i<block_size; i++){
+								cur_data_pos = data_pos + i*dim0_offset + i*dim1_offset + i;
+								curData = *cur_data_pos;
+								pred_sz = cur_data_pos[-1] + cur_data_pos[-dim1_offset]+ cur_data_pos[-dim0_offset] - cur_data_pos[-dim1_offset - 1] - cur_data_pos[-dim0_offset - 1] - cur_data_pos[-dim0_offset - dim1_offset] + cur_data_pos[-dim0_offset - dim1_offset - 1];
+								pred_reg = reg_params_pos[0] * i + reg_params_pos[params_offset_b] * i + reg_params_pos[params_offset_c] * i + reg_params_pos[params_offset_d];							
+								err_sz += fabs(pred_sz - curData) + noise;
+								err_reg += fabs(pred_reg - curData);
+
+								bmi = block_size - i;
+								cur_data_pos = data_pos + i*dim0_offset + i*dim1_offset + bmi;
+								curData = *cur_data_pos;
+								pred_sz = cur_data_pos[-1] + cur_data_pos[-dim1_offset]+ cur_data_pos[-dim0_offset] - cur_data_pos[-dim1_offset - 1] - cur_data_pos[-dim0_offset - 1] - cur_data_pos[-dim0_offset - dim1_offset] + cur_data_pos[-dim0_offset - dim1_offset - 1];
+								pred_reg = reg_params_pos[0] * i + reg_params_pos[params_offset_b] * i + reg_params_pos[params_offset_c] * bmi + reg_params_pos[params_offset_d];							
+								err_sz += fabs(pred_sz - curData) + noise;
+								err_reg += fabs(pred_reg - curData);								
+
+								cur_data_pos = data_pos + i*dim0_offset + bmi*dim1_offset + i;
+								curData = *cur_data_pos;
+								pred_sz = cur_data_pos[-1] + cur_data_pos[-dim1_offset]+ cur_data_pos[-dim0_offset] - cur_data_pos[-dim1_offset - 1] - cur_data_pos[-dim0_offset - 1] - cur_data_pos[-dim0_offset - dim1_offset] + cur_data_pos[-dim0_offset - dim1_offset - 1];
+								pred_reg = reg_params_pos[0] * i + reg_params_pos[params_offset_b] * bmi + reg_params_pos[params_offset_c] * i + reg_params_pos[params_offset_d];							
+								err_sz += fabs(pred_sz - curData) + noise;
+								err_reg += fabs(pred_reg - curData);								
+
+								cur_data_pos = data_pos + i*dim0_offset + bmi*dim1_offset + bmi;
+								curData = *cur_data_pos;
+								pred_sz = cur_data_pos[-1] + cur_data_pos[-dim1_offset]+ cur_data_pos[-dim0_offset] - cur_data_pos[-dim1_offset - 1] - cur_data_pos[-dim0_offset - 1] - cur_data_pos[-dim0_offset - dim1_offset] + cur_data_pos[-dim0_offset - dim1_offset - 1];
+								pred_reg = reg_params_pos[0] * i + reg_params_pos[params_offset_b] * bmi + reg_params_pos[params_offset_c] * bmi + reg_params_pos[params_offset_d];							
+								err_sz += fabs(pred_sz - curData) + noise;
+								err_reg += fabs(pred_reg - curData);
+							}
+						}
+						else{
+							for(int i=1; i<block_size; i++){
+								cur_data_pos = data_pos + i*dim0_offset + i*dim1_offset + i;
+								curData = *cur_data_pos;
+								pred_sz = cur_data_pos[-1] + cur_data_pos[-dim1_offset]+ cur_data_pos[-dim0_offset] - cur_data_pos[-dim1_offset - 1] - cur_data_pos[-dim0_offset - 1] - cur_data_pos[-dim0_offset - dim1_offset] + cur_data_pos[-dim0_offset - dim1_offset - 1];
+								pred_reg = reg_params_pos[0] * i + reg_params_pos[params_offset_b] * i + reg_params_pos[params_offset_c] * i + reg_params_pos[params_offset_d];							
+								err_sz += fabs(pred_sz - curData) + noise;
+								err_reg += fabs(pred_reg - curData);
+
+								bmi = block_size - i;
+								cur_data_pos = data_pos + i*dim0_offset + i*dim1_offset + bmi;
+								curData = *cur_data_pos;
+								pred_sz = cur_data_pos[-1] + cur_data_pos[-dim1_offset]+ cur_data_pos[-dim0_offset] - cur_data_pos[-dim1_offset - 1] - cur_data_pos[-dim0_offset - 1] - cur_data_pos[-dim0_offset - dim1_offset] + cur_data_pos[-dim0_offset - dim1_offset - 1];
+								pred_reg = reg_params_pos[0] * i + reg_params_pos[params_offset_b] * i + reg_params_pos[params_offset_c] * bmi + reg_params_pos[params_offset_d];							
+								err_sz += fabs(pred_sz - curData) + noise;
+								err_reg += fabs(pred_reg - curData);								
+
+								cur_data_pos = data_pos + i*dim0_offset + bmi*dim1_offset + i;
+								curData = *cur_data_pos;
+								pred_sz = cur_data_pos[-1] + cur_data_pos[-dim1_offset]+ cur_data_pos[-dim0_offset] - cur_data_pos[-dim1_offset - 1] - cur_data_pos[-dim0_offset - 1] - cur_data_pos[-dim0_offset - dim1_offset] + cur_data_pos[-dim0_offset - dim1_offset - 1];
+								pred_reg = reg_params_pos[0] * i + reg_params_pos[params_offset_b] * bmi + reg_params_pos[params_offset_c] * i + reg_params_pos[params_offset_d];							
+								err_sz += fabs(pred_sz - curData) + noise;
+								err_reg += fabs(pred_reg - curData);								
+
+								cur_data_pos = data_pos + i*dim0_offset + bmi*dim1_offset + bmi;
+								curData = *cur_data_pos;
+								pred_sz = cur_data_pos[-1] + cur_data_pos[-dim1_offset]+ cur_data_pos[-dim0_offset] - cur_data_pos[-dim1_offset - 1] - cur_data_pos[-dim0_offset - 1] - cur_data_pos[-dim0_offset - dim1_offset] + cur_data_pos[-dim0_offset - dim1_offset - 1];
+								pred_reg = reg_params_pos[0] * i + reg_params_pos[params_offset_b] * bmi + reg_params_pos[params_offset_c] * bmi + reg_params_pos[params_offset_d];							
+								err_sz += fabs(pred_sz - curData) + noise;
+								err_reg += fabs(pred_reg - curData);
+							}
+						}
+						use_reg = (err_reg < err_sz);
+
+					}
+					if(use_reg)
+					{
+						{
+							/*predict coefficients in current block via previous reg_block*/
+							float cur_coeff;
+							double diff, itvNum;
+							for(int e=0; e<4; e++){
+								cur_coeff = reg_params_pos[e*num_blocks];
+								diff = cur_coeff - last_coeffcients[e];
+								itvNum = fabs(diff)/precision[e] + 1;
+								if (itvNum < coeff_intvCapacity_sz){
+									if (diff < 0) itvNum = -itvNum;
+									coeff_type[e][coeff_index] = (int) (itvNum/2) + coeff_intvRadius;
+									last_coeffcients[e] = last_coeffcients[e] + 2 * (coeff_type[e][coeff_index] - coeff_intvRadius) * precision[e];
+									//guarantee the compression error bound against the case of machine-epsilon
+									if(fabs(cur_coeff - last_coeffcients[e])>precision[e]){	
+										coeff_type[e][coeff_index] = 0;
+										last_coeffcients[e] = cur_coeff;	
+										coeff_unpred_data[e][coeff_unpredictable_count[e] ++] = cur_coeff;
+									}					
+								}
+								else{
+									coeff_type[e][coeff_index] = 0;
+									last_coeffcients[e] = cur_coeff;
+									coeff_unpred_data[e][coeff_unpredictable_count[e] ++] = cur_coeff;
+								}
+							}
+							coeff_index ++;
+						}
+						float curData;
+						float pred;
+						double itvNum;
+						double diff;
+						size_t index = 0;
+						size_t block_unpredictable_count = 0;
+						float * cur_data_pos = data_pos;
+						for(size_t ii=0; ii<current_blockcount_x - 1; ii++){
+							for(size_t jj=0; jj<current_blockcount_y; jj++){
+								for(size_t kk=0; kk<current_blockcount_z; kk++){
+
+									curData = *cur_data_pos;
+									pred = last_coeffcients[0] * ii + last_coeffcients[1] * jj + last_coeffcients[2] * kk + last_coeffcients[3];									
+									diff = curData - pred;
+									itvNum = fabs(diff)/tmp_realPrecision + 1;
+									if (itvNum < intvCapacity){
+										if (diff < 0) itvNum = -itvNum;
+										type[index] = (int) (itvNum/2) + intvRadius;
+										pred = pred + 2 * (type[index] - intvRadius) * tmp_realPrecision;
+										//guarantee the compression error bound against the case of machine-epsilon
+										if(fabs(curData - pred)>tmp_realPrecision){	
+											type[index] = 0;
+											pred = curData;
+											unpredictable_data[block_unpredictable_count ++] = curData;
+										}		
+									}
+									else{
+										type[index] = 0;
+										pred = curData;
+										unpredictable_data[block_unpredictable_count ++] = curData;
+									}
+
+#ifdef HAVE_TIMECMPR
+									size_t point_offset = ii*dim0_offset + jj*dim1_offset + kk;
+									if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+										decData[block_offset + point_offset] = pred;
+#endif																		
+
+
+									if((jj == current_blockcount_y - 1) || (kk == current_blockcount_z - 1)){
+										// assign value to block surfaces
+										pb_pos[ii * strip_dim0_offset + jj * strip_dim1_offset + kk] = pred;
+									}
+									index ++;	
+									cur_data_pos ++;
+								}
+								cur_data_pos += dim1_offset - current_blockcount_z;
+							}
+							cur_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+						}
+						/*dealing with the last ii (boundary)*/
+						{
+							// ii == current_blockcount_x - 1
+							size_t ii = current_blockcount_x - 1;
+							for(size_t jj=0; jj<current_blockcount_y; jj++){
+								for(size_t kk=0; kk<current_blockcount_z; kk++){
+									curData = *cur_data_pos;
+									pred = last_coeffcients[0] * ii + last_coeffcients[1] * jj + last_coeffcients[2] * kk + last_coeffcients[3];									
+									diff = curData - pred;
+									itvNum = fabs(diff)/tmp_realPrecision + 1;
+									if (itvNum < intvCapacity){
+										if (diff < 0) itvNum = -itvNum;
+										type[index] = (int) (itvNum/2) + intvRadius;
+										pred = pred + 2 * (type[index] - intvRadius) * tmp_realPrecision;
+										//guarantee the compression error bound against the case of machine-epsilon
+										if(fabs(curData - pred)>tmp_realPrecision){	
+											type[index] = 0;
+											pred = curData;
+											unpredictable_data[block_unpredictable_count ++] = curData;
+										}		
+									}
+									else{
+										type[index] = 0;
+										pred = curData;
+										unpredictable_data[block_unpredictable_count ++] = curData;
+									}
+									
+#ifdef HAVE_TIMECMPR
+									size_t point_offset = ii*dim0_offset + jj*dim1_offset + kk;
+									if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+										decData[block_offset + point_offset] = pred;
+#endif																											
+
+									if((jj == current_blockcount_y - 1) || (kk == current_blockcount_z - 1)){
+										// assign value to block surfaces
+										pb_pos[ii * strip_dim0_offset + jj * strip_dim1_offset + kk] = pred;
+									}
+									// assign value to next prediction buffer
+									next_pb_pos[jj * strip_dim1_offset + kk] = pred;
+									index ++;
+									cur_data_pos ++;
+								}
+								cur_data_pos += dim1_offset - current_blockcount_z;
+							}
+						}
+						unpredictable_count = block_unpredictable_count;
+						strip_unpredictable_count += unpredictable_count;
+						unpredictable_data += unpredictable_count;						
+						reg_count ++;
+					}
+					else{
+						// use SZ
+						// SZ prediction
+						unpredictable_count = 0;
+						float * cur_pb_pos = pb_pos;
+						float * cur_data_pos = data_pos;
+						float curData;
+						float pred3D;
+						double itvNum, diff;
+						size_t index = 0;
+						for(size_t ii=0; ii<current_blockcount_x - 1; ii++){
+							for(size_t jj=0; jj<current_blockcount_y; jj++){
+								for(size_t kk=0; kk<current_blockcount_z; kk++){
+
+									curData = *cur_data_pos;
+									pred3D = cur_pb_pos[-1] + cur_pb_pos[-strip_dim1_offset]+ cur_pb_pos[-strip_dim0_offset] - cur_pb_pos[-strip_dim1_offset - 1]
+											 - cur_pb_pos[-strip_dim0_offset - 1] - cur_pb_pos[-strip_dim0_offset - strip_dim1_offset] + cur_pb_pos[-strip_dim0_offset - strip_dim1_offset - 1];
+									diff = curData - pred3D;
+									itvNum = fabs(diff)/realPrecision + 1;
+									if (itvNum < intvCapacity_sz){
+										if (diff < 0) itvNum = -itvNum;
+										type[index] = (int) (itvNum/2) + intvRadius;
+										*cur_pb_pos = pred3D + 2 * (type[index] - intvRadius) * tmp_realPrecision;
+										//guarantee the compression error bound against the case of machine-epsilon
+										if(fabs(curData - *cur_pb_pos)>tmp_realPrecision){	
+											type[index] = 0;
+											*cur_pb_pos = curData;	
+											unpredictable_data[unpredictable_count ++] = curData;
+										}					
+									}
+									else{
+										type[index] = 0;
+										*cur_pb_pos = curData;
+										unpredictable_data[unpredictable_count ++] = curData;
+									}
+									
+#ifdef HAVE_TIMECMPR
+									size_t point_offset = ii*dim0_offset + jj*dim1_offset + kk;
+									if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+										decData[block_offset + point_offset] = *cur_pb_pos;
+#endif																											
+									index ++;
+									cur_pb_pos ++;
+									cur_data_pos ++;
+								}
+								cur_pb_pos += strip_dim1_offset - current_blockcount_z;
+								cur_data_pos += dim1_offset - current_blockcount_z;
+							}
+							cur_pb_pos += strip_dim0_offset - current_blockcount_y * strip_dim1_offset;
+							cur_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+						}
+						/*dealing with the last ii (boundary)*/
+						{
+							// ii == current_blockcount_x - 1
+							for(size_t jj=0; jj<current_blockcount_y; jj++){
+								for(size_t kk=0; kk<current_blockcount_z; kk++){
+
+									curData = *cur_data_pos;
+									pred3D = cur_pb_pos[-1] + cur_pb_pos[-strip_dim1_offset]+ cur_pb_pos[-strip_dim0_offset] - cur_pb_pos[-strip_dim1_offset - 1]
+											 - cur_pb_pos[-strip_dim0_offset - 1] - cur_pb_pos[-strip_dim0_offset - strip_dim1_offset] + cur_pb_pos[-strip_dim0_offset - strip_dim1_offset - 1];
+									diff = curData - pred3D;
+									itvNum = fabs(diff)/realPrecision + 1;
+									if (itvNum < intvCapacity_sz){
+										if (diff < 0) itvNum = -itvNum;
+										type[index] = (int) (itvNum/2) + intvRadius;
+										*cur_pb_pos = pred3D + 2 * (type[index] - intvRadius) * tmp_realPrecision;
+										//guarantee the compression error bound against the case of machine-epsilon
+										if(fabs(curData - *cur_pb_pos)>tmp_realPrecision){	
+											type[index] = 0;
+											*cur_pb_pos = curData;	
+											unpredictable_data[unpredictable_count ++] = curData;
+										}					
+									}
+									else{
+										type[index] = 0;
+										*cur_pb_pos = curData;
+										unpredictable_data[unpredictable_count ++] = curData;
+									}
+									
+#ifdef HAVE_TIMECMPR
+									size_t ii = current_blockcount_x - 1;
+									size_t point_offset = ii*dim0_offset + jj*dim1_offset + kk;
+									if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+										decData[block_offset + point_offset] = *cur_pb_pos;
+#endif																											
+									
+									// assign value to next prediction buffer
+									next_pb_pos[jj * strip_dim1_offset + kk] = *cur_pb_pos;
+									index ++;
+									cur_pb_pos ++;
+									cur_data_pos ++;
+								}
+								cur_pb_pos += strip_dim1_offset - current_blockcount_z;
+								cur_data_pos += dim1_offset - current_blockcount_z;
+							}
+						}
+						strip_unpredictable_count += unpredictable_count;
+						unpredictable_data += unpredictable_count;
+						// change indicator
+						indicator_pos[k] = 1;
+					}// end SZ
+					
+					reg_params_pos ++;
+					data_pos += current_blockcount_z;
+					pb_pos += current_blockcount_z;
+					next_pb_pos += current_blockcount_z;
+					type += current_blockcount_x * current_blockcount_y * current_blockcount_z;
+
+				}
+
+				if(strip_unpredictable_count > max_unpred_count){
+					max_unpred_count = strip_unpredictable_count;
+				}
+				total_unpred += strip_unpredictable_count;
+				indicator_pos += num_z;
+			}
+			float * tmp;
+			tmp = cur_pb_buf;
+			cur_pb_buf = next_pb_buf;
+			next_pb_buf = tmp;
+		}
+	}
+
+	free(prediction_buffer_1);
+	free(prediction_buffer_2);
+
+	int stateNum = 2*quantization_intervals;
+	HuffmanTree* huffmanTree = createHuffmanTree(stateNum);
+
+	size_t nodeCount = 0;
+	init(huffmanTree, result_type, num_elements);
+	size_t i = 0;
+	for (i = 0; i < huffmanTree->stateNum; i++)
+		if (huffmanTree->code[i]) nodeCount++; 
+	nodeCount = nodeCount*2-1;
+
+	unsigned char *treeBytes;
+	unsigned int treeByteSize = convert_HuffTree_to_bytes_anyStates(huffmanTree, nodeCount, &treeBytes);
+
+	unsigned int meta_data_offset = 3 + 1 + MetaDataByteLength;
+	// total size 										metadata		  # elements     real precision		intervals	nodeCount		huffman 	 	block index 						unpredicatable count						mean 					 	unpred size 				elements
+	unsigned char * result = (unsigned char *) calloc(meta_data_offset + exe_params->SZ_SIZE_TYPE + sizeof(double) + sizeof(int) + sizeof(int) + treeByteSize + num_blocks * sizeof(unsigned short) + num_blocks * sizeof(unsigned short) + num_blocks * sizeof(float) + total_unpred * sizeof(float) + num_elements * sizeof(int), 1);
+	unsigned char * result_pos = result;
+	initRandomAccessBytes(result_pos);
+	
+	result_pos += meta_data_offset;
+	
+	sizeToBytes(result_pos,num_elements); //SZ_SIZE_TYPE: 4 or 8
+	result_pos += exe_params->SZ_SIZE_TYPE;
+
+	intToBytes_bigEndian(result_pos, block_size);
+	result_pos += sizeof(int);
+	doubleToBytes(result_pos, realPrecision);
+	result_pos += sizeof(double);
+	intToBytes_bigEndian(result_pos, quantization_intervals);
+	result_pos += sizeof(int);
+	intToBytes_bigEndian(result_pos, treeByteSize);
+	result_pos += sizeof(int);
+	intToBytes_bigEndian(result_pos, nodeCount);
+	result_pos += sizeof(int);
+	memcpy(result_pos, treeBytes, treeByteSize);
+	result_pos += treeByteSize;
+	free(treeBytes);
+
+	memcpy(result_pos, &use_mean, sizeof(unsigned char));
+	result_pos += sizeof(unsigned char);
+	memcpy(result_pos, &mean, sizeof(float));
+	result_pos += sizeof(float);
+	size_t indicator_size = convertIntArray2ByteArray_fast_1b_to_result(indicator, num_blocks, result_pos);
+	result_pos += indicator_size;
+	
+	//convert the lead/mid/resi to byte stream
+	if(reg_count > 0){
+		for(int e=0; e<4; e++){
+			int stateNum = 2*coeff_intvCapacity_sz;
+			HuffmanTree* huffmanTree = createHuffmanTree(stateNum);
+			size_t nodeCount = 0;
+			init(huffmanTree, coeff_type[e], reg_count);
+			size_t i = 0;
+			for (i = 0; i < huffmanTree->stateNum; i++)
+				if (huffmanTree->code[i]) nodeCount++; 
+			nodeCount = nodeCount*2-1;
+			unsigned char *treeBytes;
+			unsigned int treeByteSize = convert_HuffTree_to_bytes_anyStates(huffmanTree, nodeCount, &treeBytes);
+			doubleToBytes(result_pos, precision[e]);
+			result_pos += sizeof(double);
+			intToBytes_bigEndian(result_pos, coeff_intvRadius);
+			result_pos += sizeof(int);
+			intToBytes_bigEndian(result_pos, treeByteSize);
+			result_pos += sizeof(int);
+			intToBytes_bigEndian(result_pos, nodeCount);
+			result_pos += sizeof(int);
+			memcpy(result_pos, treeBytes, treeByteSize);		
+			result_pos += treeByteSize;
+			free(treeBytes);
+			size_t typeArray_size = 0;
+			encode(huffmanTree, coeff_type[e], reg_count, result_pos + sizeof(size_t), &typeArray_size);
+			sizeToBytes(result_pos, typeArray_size);
+			result_pos += sizeof(size_t) + typeArray_size;
+			intToBytes_bigEndian(result_pos, coeff_unpredictable_count[e]);
+			result_pos += sizeof(int);
+			memcpy(result_pos, coeff_unpred_data[e], coeff_unpredictable_count[e]*sizeof(float));
+			result_pos += coeff_unpredictable_count[e]*sizeof(float);
+			SZ_ReleaseHuffman(huffmanTree);
+		}
+	}
+	free(coeff_result_type);
+	free(coeff_unpredictable_data);
+	
+	//record the number of unpredictable data and also store them
+	memcpy(result_pos, &total_unpred, sizeof(size_t));
+	result_pos += sizeof(size_t);
+	memcpy(result_pos, result_unpredictable_data, total_unpred * sizeof(float));
+	result_pos += total_unpred * sizeof(float);
+	size_t typeArray_size = 0;
+	encode(huffmanTree, result_type, num_elements, result_pos, &typeArray_size);
+	result_pos += typeArray_size;
+	size_t totalEncodeSize = result_pos - result;
+	free(indicator);
+	free(result_unpredictable_data);
+	free(result_type);
+	free(reg_params);
+
+	
+	SZ_ReleaseHuffman(huffmanTree);
+	*comp_size = totalEncodeSize;
+	return result;
+}
+
+unsigned char * SZ_compress_float_3D_MDQ_random_access_with_blocked_regression(float *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, size_t * comp_size){
+	/* Compresses an r1 x r2 x r3 float array (r3 varies fastest) in 6x6x6 blocks, choosing per block between a linear-regression predictor and the 7-neighbour SZ predictor; realPrecision is the absolute error bound; returns the calloc'ed byte stream and stores its length in *comp_size. */
+	unsigned int quantization_intervals;
+	float sz_sample_correct_freq = -1;//0.5; //-1
+	float dense_pos;
+	float mean_flush_freq;
+	unsigned char use_mean = 0;
+
+	// calculate block dims
+	size_t num_x, num_y, num_z;
+	size_t block_size = 6;
+	num_x = (r1 - 1) / block_size + 1;
+	num_y = (r2 - 1) / block_size + 1;
+	num_z = (r3 - 1) / block_size + 1;
+
+	size_t max_num_block_elements = block_size * block_size * block_size;
+	size_t num_blocks = num_x * num_y * num_z;
+	size_t num_elements = r1 * r2 * r3;
+
+	size_t dim0_offset = r2 * r3;
+	size_t dim1_offset = r3;	
+
+	int * result_type = (int *) malloc(num_blocks*max_num_block_elements * sizeof(int));
+	size_t unpred_data_max_size = max_num_block_elements;
+	float * result_unpredictable_data = (float *) malloc(unpred_data_max_size * sizeof(float) * num_blocks);
+	size_t total_unpred = 0;
+	size_t unpredictable_count;
+	float * data_pos = oriData;
+	int * type = result_type;
+	float * reg_params = (float *) malloc(num_blocks * 4 * sizeof(float));
+	float * reg_params_pos = reg_params;
+	// move regression part out
+	size_t params_offset_b = num_blocks;
+	size_t params_offset_c = 2*num_blocks;
+	size_t params_offset_d = 3*num_blocks;
+	float * pred_buffer = (float *) malloc((block_size+1)*(block_size+1)*(block_size+1)*sizeof(float));
+	float * pred_buffer_pos = NULL;
+	float * block_data_pos_x = NULL;
+	float * block_data_pos_y = NULL;
+	float * block_data_pos_z = NULL;
+	for(size_t i=0; i<num_x; i++){
+		for(size_t j=0; j<num_y; j++){
+			for(size_t k=0; k<num_z; k++){
+				data_pos = oriData + i*block_size * dim0_offset + j*block_size * dim1_offset + k*block_size;
+				pred_buffer_pos = pred_buffer;
+				block_data_pos_x = data_pos;
+				// use the buffer as block_size*block_size*block_size
+				for(int ii=0; ii<block_size; ii++){
+					block_data_pos_y = block_data_pos_x;
+					for(int jj=0; jj<block_size; jj++){
+						block_data_pos_z = block_data_pos_y;
+						for(int kk=0; kk<block_size; kk++){
+							*pred_buffer_pos = *block_data_pos_z;
+							if(k*block_size + kk + 1 < r3) block_data_pos_z ++; // stop at the last valid element so padded cells repeat it
+							pred_buffer_pos ++;
+						}
+						if(j*block_size + jj + 1 < r2) block_data_pos_y += dim1_offset;
+					}
+					if(i*block_size + ii + 1 < r1) block_data_pos_x += dim0_offset;
+				}
+				/*Calculate regression coefficients*/
+				{
+					float * cur_data_pos = pred_buffer;
+					float fx = 0.0;
+					float fy = 0.0;
+					float fz = 0.0;
+					float f = 0;
+					float sum_x, sum_y; 
+					float curData;
+					for(size_t i=0; i<block_size; i++){
+						sum_x = 0;
+						for(size_t j=0; j<block_size; j++){
+							sum_y = 0;
+							for(size_t k=0; k<block_size; k++){
+								curData = *cur_data_pos;
+								sum_y += curData;
+								fz += curData * k;
+								cur_data_pos ++;
+							}
+							fy += sum_y * j;
+							sum_x += sum_y;
+						}
+						fx += sum_x * i;
+						f += sum_x;
+					}
+					float coeff = 1.0 / (block_size * block_size * block_size);
+					reg_params_pos[0] = (2 * fx / (block_size - 1) - f) * 6 * coeff / (block_size + 1);
+					reg_params_pos[params_offset_b] = (2 * fy / (block_size - 1) - f) * 6 * coeff / (block_size + 1);
+					reg_params_pos[params_offset_c] = (2 * fz / (block_size - 1) - f) * 6 * coeff / (block_size + 1);
+					reg_params_pos[params_offset_d] = f * coeff - ((block_size - 1) * reg_params_pos[0] / 2 + (block_size - 1) * reg_params_pos[params_offset_b] / 2 + (block_size - 1) * reg_params_pos[params_offset_c] / 2);
+				}
+				reg_params_pos ++;
+			}
+		}
+	}
+	
+	//Compress coefficient arrays
+	double precision_a, precision_b, precision_c, precision_d;
+	float rel_param_err = 0.025;
+	precision_a = rel_param_err * realPrecision / block_size;
+	precision_b = rel_param_err * realPrecision / block_size;
+	precision_c = rel_param_err * realPrecision / block_size;
+	precision_d = rel_param_err * realPrecision;
+
+	if(exe_params->optQuantMode==1)
+	{
+		quantization_intervals = optimize_intervals_float_3D_with_freq_and_dense_pos(oriData, r1, r2, r3, realPrecision, &dense_pos, &sz_sample_correct_freq, &mean_flush_freq);
+		if(mean_flush_freq > 0.5 || mean_flush_freq > sz_sample_correct_freq) use_mean = 1;
+		updateQuantizationInfo(quantization_intervals);
+	}	
+	else{
+		quantization_intervals = exe_params->intvCapacity;
+	}
+
+	float mean = 0;
+	if(use_mean){
+		// compute mean
+		double sum = 0.0;
+		size_t mean_count = 0;
+		for(size_t i=0; i<num_elements; i++){
+			if(fabs(oriData[i] - dense_pos) < realPrecision){
+				sum += oriData[i];
+				mean_count ++;
+			}
+		}
+		if(mean_count > 0) mean = sum / mean_count;
+	}
+
+	double tmp_realPrecision = realPrecision;
+
+	// per-block bookkeeping: cursor into the unpredictable values and one indicator byte per block (set to 1 when the SZ predictor is used)
+	float * unpredictable_data = result_unpredictable_data;
+	unsigned char * indicator = (unsigned char *) malloc(num_blocks * sizeof(unsigned char));
+	memset(indicator, 0, num_blocks * sizeof(unsigned char));
+	size_t reg_count = 0;
+	unsigned char * indicator_pos = indicator;
+
+	int intvCapacity = exe_params->intvCapacity;
+	int intvRadius = exe_params->intvRadius;	
+	int use_reg = 0;
+	float noise = realPrecision * 1.22;
+
+	reg_params_pos = reg_params;
+	// compress the regression coefficients on the fly
+	float last_coeffcients[4] = {0.0};
+	int coeff_intvCapacity_sz = 65536;
+	int coeff_intvRadius = coeff_intvCapacity_sz / 2;
+	int * coeff_type[4];
+	int * coeff_result_type = (int *) malloc(num_blocks*4*sizeof(int));
+	float * coeff_unpred_data[4];
+	float * coeff_unpredictable_data = (float *) malloc(num_blocks*4*sizeof(float));
+	double precision[4];
+	precision[0] = precision_a, precision[1] = precision_b, precision[2] = precision_c, precision[3] = precision_d;
+	for(int i=0; i<4; i++){
+		coeff_type[i] = coeff_result_type + i * num_blocks;
+		coeff_unpred_data[i] = coeff_unpredictable_data + i * num_blocks;
+	}
+	int coeff_index = 0;
+	unsigned int coeff_unpredictable_count[4] = {0};
+
+	memset(pred_buffer, 0, (block_size+1)*(block_size+1)*(block_size+1)*sizeof(float));
+	int pred_buffer_block_size = block_size + 1;
+	int strip_dim0_offset = pred_buffer_block_size * pred_buffer_block_size;
+	int strip_dim1_offset = pred_buffer_block_size;
+
+	if(use_mean){
+		int intvCapacity_sz = intvCapacity - 2;
+		type = result_type;
+		for(size_t i=0; i<num_x; i++){
+			for(size_t j=0; j<num_y; j++){
+				for(size_t k=0; k<num_z; k++){
+					data_pos = oriData + i*block_size * dim0_offset + j*block_size * dim1_offset + k*block_size;
+					// add 1 in x, y, z offset
+					pred_buffer_pos = pred_buffer + pred_buffer_block_size*pred_buffer_block_size + pred_buffer_block_size + 1;
+					block_data_pos_x = data_pos;
+					for(int ii=0; ii<block_size; ii++){
+						block_data_pos_y = block_data_pos_x;
+						for(int jj=0; jj<block_size; jj++){
+							block_data_pos_z = block_data_pos_y;
+							for(int kk=0; kk<block_size; kk++){
+								*pred_buffer_pos = *block_data_pos_z;
+								if(k*block_size + kk + 1< r3) block_data_pos_z ++;
+								pred_buffer_pos ++;
+							}
+							// add 1 in z offset
+							pred_buffer_pos ++;
+							if(j*block_size + jj + 1< r2) block_data_pos_y += dim1_offset;
+						}
+						// add 1 in y offset
+						pred_buffer_pos += pred_buffer_block_size;
+						if(i*block_size + ii + 1< r1) block_data_pos_x += dim0_offset;
+					}
+					/*sampling and decide which predictor*/
+					{
+						// for each i in [2, block_size] sample pred_buffer points (i,i,i) (i,i,bmi) (i,bmi,i) (i,bmi,bmi), with bmi = block_size - i
+						float * cur_data_pos;
+						float curData;
+						float pred_reg, pred_sz;
+						float err_sz = 0.0, err_reg = 0.0;
+						int bmi = 0;
+						for(int i=2; i<=block_size; i++){
+							cur_data_pos = pred_buffer + i*pred_buffer_block_size*pred_buffer_block_size + i*pred_buffer_block_size + i;
+							curData = *cur_data_pos;
+							pred_sz = cur_data_pos[-1] + cur_data_pos[-strip_dim1_offset]+ cur_data_pos[-strip_dim0_offset] - cur_data_pos[-strip_dim1_offset - 1] - cur_data_pos[-strip_dim0_offset - 1] - cur_data_pos[-strip_dim0_offset - strip_dim1_offset] + cur_data_pos[-strip_dim0_offset - strip_dim1_offset - 1];
+							pred_reg = reg_params_pos[0] * i + reg_params_pos[params_offset_b] * i + reg_params_pos[params_offset_c] * i + reg_params_pos[params_offset_d];							
+							err_sz += MIN(fabs(pred_sz - curData) + noise, fabs(mean - curData));
+							err_reg += fabs(pred_reg - curData);
+
+							bmi = block_size - i;
+							cur_data_pos = pred_buffer + i*pred_buffer_block_size*pred_buffer_block_size + i*pred_buffer_block_size + bmi;
+							curData = *cur_data_pos;
+							pred_sz = cur_data_pos[-1] + cur_data_pos[-strip_dim1_offset]+ cur_data_pos[-strip_dim0_offset] - cur_data_pos[-strip_dim1_offset - 1] - cur_data_pos[-strip_dim0_offset - 1] - cur_data_pos[-strip_dim0_offset - strip_dim1_offset] + cur_data_pos[-strip_dim0_offset - strip_dim1_offset - 1];
+							pred_reg = reg_params_pos[0] * i + reg_params_pos[params_offset_b] * i + reg_params_pos[params_offset_c] * bmi + reg_params_pos[params_offset_d];							
+							err_sz += MIN(fabs(pred_sz - curData) + noise, fabs(mean - curData));
+							err_reg += fabs(pred_reg - curData);								
+
+							cur_data_pos = pred_buffer + i*pred_buffer_block_size*pred_buffer_block_size + bmi*pred_buffer_block_size + i;
+							curData = *cur_data_pos;
+							pred_sz = cur_data_pos[-1] + cur_data_pos[-strip_dim1_offset]+ cur_data_pos[-strip_dim0_offset] - cur_data_pos[-strip_dim1_offset - 1] - cur_data_pos[-strip_dim0_offset - 1] - cur_data_pos[-strip_dim0_offset - strip_dim1_offset] + cur_data_pos[-strip_dim0_offset - strip_dim1_offset - 1];
+							pred_reg = reg_params_pos[0] * i + reg_params_pos[params_offset_b] * bmi + reg_params_pos[params_offset_c] * i + reg_params_pos[params_offset_d];							
+							err_sz += MIN(fabs(pred_sz - curData) + noise, fabs(mean - curData));
+							err_reg += fabs(pred_reg - curData);								
+
+							cur_data_pos = pred_buffer + i*pred_buffer_block_size*pred_buffer_block_size + bmi*pred_buffer_block_size + bmi;
+							curData = *cur_data_pos;
+							pred_sz = cur_data_pos[-1] + cur_data_pos[-strip_dim1_offset]+ cur_data_pos[-strip_dim0_offset] - cur_data_pos[-strip_dim1_offset - 1] - cur_data_pos[-strip_dim0_offset - 1] - cur_data_pos[-strip_dim0_offset - strip_dim1_offset] + cur_data_pos[-strip_dim0_offset - strip_dim1_offset - 1];
+							pred_reg = reg_params_pos[0] * i + reg_params_pos[params_offset_b] * bmi + reg_params_pos[params_offset_c] * bmi + reg_params_pos[params_offset_d];							
+							err_sz += MIN(fabs(pred_sz - curData) + noise, fabs(mean - curData));
+							err_reg += fabs(pred_reg - curData);
+						}
+						
+						use_reg = (err_reg < err_sz);
+					}
+					if(use_reg){
+						{
+							/*predict coefficients in current block via previous reg_block*/
+							float cur_coeff;
+							double diff, itvNum;
+							for(int e=0; e<4; e++){
+								cur_coeff = reg_params_pos[e*num_blocks];
+								diff = cur_coeff - last_coeffcients[e];
+								itvNum = fabs(diff)/precision[e] + 1;
+								if (itvNum < coeff_intvCapacity_sz){
+									if (diff < 0) itvNum = -itvNum;
+									coeff_type[e][coeff_index] = (int) (itvNum/2) + coeff_intvRadius;
+									last_coeffcients[e] = last_coeffcients[e] + 2 * (coeff_type[e][coeff_index] - coeff_intvRadius) * precision[e];
+									//guarantee compression error against the case of machine-epsilon
+									if(fabs(cur_coeff - last_coeffcients[e])>precision[e]){	
+										coeff_type[e][coeff_index] = 0;
+										last_coeffcients[e] = cur_coeff;	
+										coeff_unpred_data[e][coeff_unpredictable_count[e] ++] = cur_coeff;
+									}					
+								}
+								else{
+									coeff_type[e][coeff_index] = 0;
+									last_coeffcients[e] = cur_coeff;
+									coeff_unpred_data[e][coeff_unpredictable_count[e] ++] = cur_coeff;
+								}
+							}
+							coeff_index ++;
+						}
+						float curData;
+						float pred;
+						double itvNum;
+						double diff;
+						size_t index = 0;
+						size_t block_unpredictable_count = 0;
+						float * cur_data_pos = pred_buffer + pred_buffer_block_size*pred_buffer_block_size + pred_buffer_block_size + 1;
+						for(size_t ii=0; ii<block_size; ii++){
+							for(size_t jj=0; jj<block_size; jj++){
+								for(size_t kk=0; kk<block_size; kk++){
+									curData = *cur_data_pos;
+									pred = last_coeffcients[0] * ii + last_coeffcients[1] * jj + last_coeffcients[2] * kk + last_coeffcients[3];									
+									diff = curData - pred;
+									itvNum = fabs(diff)/tmp_realPrecision + 1;
+									if (itvNum < intvCapacity){
+										if (diff < 0) itvNum = -itvNum;
+										type[index] = (int) (itvNum/2) + intvRadius;
+										pred = pred + 2 * (type[index] - intvRadius) * tmp_realPrecision;
+										//guarantee compression error against the case of machine-epsilon
+										if(fabs(curData - pred)>tmp_realPrecision){	
+											type[index] = 0;
+											pred = curData;
+											unpredictable_data[block_unpredictable_count ++] = curData;
+										}		
+									}
+									else{
+										type[index] = 0;
+										pred = curData;
+										unpredictable_data[block_unpredictable_count ++] = curData;
+									}
+									index ++;	
+									cur_data_pos ++;
+								}
+								cur_data_pos ++;
+							}
+							cur_data_pos += pred_buffer_block_size;
+						}
+						
+						total_unpred += block_unpredictable_count;
+						unpredictable_data += block_unpredictable_count;						
+						reg_count ++;
+					}
+					else{
+						// use SZ
+						// SZ prediction
+						unpredictable_count = 0;
+						float * cur_data_pos = pred_buffer + pred_buffer_block_size*pred_buffer_block_size + pred_buffer_block_size + 1;
+						float curData;
+						float pred3D;
+						double itvNum, diff;
+						size_t index = 0;
+						for(size_t ii=0; ii<block_size; ii++){
+							for(size_t jj=0; jj<block_size; jj++){
+								for(size_t kk=0; kk<block_size; kk++){
+
+									curData = *cur_data_pos;
+									if(fabs(curData - mean) <= realPrecision){
+										type[index] = 1;
+										*cur_data_pos = mean;
+									}
+									else
+									{
+										pred3D = cur_data_pos[-1] + cur_data_pos[-strip_dim1_offset]+ cur_data_pos[-strip_dim0_offset] - cur_data_pos[-strip_dim1_offset - 1]
+												 - cur_data_pos[-strip_dim0_offset - 1] - cur_data_pos[-strip_dim0_offset - strip_dim1_offset] + cur_data_pos[-strip_dim0_offset - strip_dim1_offset - 1];
+										diff = curData - pred3D;
+										itvNum = fabs(diff)/realPrecision + 1;
+										if (itvNum < intvCapacity_sz){
+											if (diff < 0) itvNum = -itvNum;
+											type[index] = (int) (itvNum/2) + intvRadius;
+											*cur_data_pos = pred3D + 2 * (type[index] - intvRadius) * tmp_realPrecision;
+											//guarantee compression error against the case of machine-epsilon
+											if(fabs(curData - *cur_data_pos)>tmp_realPrecision){	
+												type[index] = 0;
+												*cur_data_pos = curData;	
+												unpredictable_data[unpredictable_count ++] = curData;
+											}					
+										}
+										else{
+											type[index] = 0;
+											*cur_data_pos = curData;
+											unpredictable_data[unpredictable_count ++] = curData;
+										}
+									}
+									index ++;
+									cur_data_pos ++;
+								}
+								cur_data_pos ++;
+							}
+							cur_data_pos += pred_buffer_block_size;
+						}
+						total_unpred += unpredictable_count;
+						unpredictable_data += unpredictable_count;
+						// change indicator
+						indicator_pos[k] = 1;
+					}// end SZ
+					reg_params_pos ++;
+					type += block_size * block_size * block_size;
+				} // end k
+				indicator_pos += num_z;
+			}// end j
+		}// end i
+	}
+	else{
+		int intvCapacity_sz = intvCapacity - 2;
+		type = result_type;
+		for(size_t i=0; i<num_x; i++){
+			for(size_t j=0; j<num_y; j++){
+				for(size_t k=0; k<num_z; k++){
+					data_pos = oriData + i*block_size * dim0_offset + j*block_size * dim1_offset + k*block_size;
+					// add 1 in x, y, z offset
+					pred_buffer_pos = pred_buffer + pred_buffer_block_size*pred_buffer_block_size + pred_buffer_block_size + 1;
+					block_data_pos_x = data_pos;
+					for(int ii=0; ii<block_size; ii++){
+						block_data_pos_y = block_data_pos_x;
+						for(int jj=0; jj<block_size; jj++){
+							block_data_pos_z = block_data_pos_y;
+							for(int kk=0; kk<block_size; kk++){
+								*pred_buffer_pos = *block_data_pos_z;
+								if(k*block_size + kk + 1 < r3) block_data_pos_z ++; // "+ 1" keeps the pointer on the last valid element of the row (same check as the other block-copy loops)
+								pred_buffer_pos ++;
+							}
+							// add 1 in z offset
+							pred_buffer_pos ++;
+							if(j*block_size + jj + 1 < r2) block_data_pos_y += dim1_offset; // never step past the last valid row
+						}
+						// add 1 in y offset
+						pred_buffer_pos += pred_buffer_block_size;
+						if(i*block_size + ii + 1 < r1) block_data_pos_x += dim0_offset; // never step past the last valid plane
+					}
+					/*sampling*/
+					{
+						// for each i in [2, block_size] sample pred_buffer points (i,i,i) (i,i,bmi) (i,bmi,i) (i,bmi,bmi), with bmi = block_size - i
+						float * cur_data_pos;
+						float curData;
+						float pred_reg, pred_sz;
+						float err_sz = 0.0, err_reg = 0.0;
+						int bmi;
+						for(int i=2; i<=block_size; i++){
+							cur_data_pos = pred_buffer + i*pred_buffer_block_size*pred_buffer_block_size + i*pred_buffer_block_size + i;
+							curData = *cur_data_pos;
+							pred_sz = cur_data_pos[-1] + cur_data_pos[-strip_dim1_offset]+ cur_data_pos[-strip_dim0_offset] - cur_data_pos[-strip_dim1_offset - 1] - cur_data_pos[-strip_dim0_offset - 1] - cur_data_pos[-strip_dim0_offset - strip_dim1_offset] + cur_data_pos[-strip_dim0_offset - strip_dim1_offset - 1];
+							pred_reg = reg_params_pos[0] * i + reg_params_pos[params_offset_b] * i + reg_params_pos[params_offset_c] * i + reg_params_pos[params_offset_d];							
+							err_sz += fabs(pred_sz - curData) + noise;
+							err_reg += fabs(pred_reg - curData);
+
+							bmi = block_size - i;
+							cur_data_pos = pred_buffer + i*pred_buffer_block_size*pred_buffer_block_size + i*pred_buffer_block_size + bmi;
+							curData = *cur_data_pos;
+							pred_sz = cur_data_pos[-1] + cur_data_pos[-strip_dim1_offset]+ cur_data_pos[-strip_dim0_offset] - cur_data_pos[-strip_dim1_offset - 1] - cur_data_pos[-strip_dim0_offset - 1] - cur_data_pos[-strip_dim0_offset - strip_dim1_offset] + cur_data_pos[-strip_dim0_offset - strip_dim1_offset - 1];
+							pred_reg = reg_params_pos[0] * i + reg_params_pos[params_offset_b] * i + reg_params_pos[params_offset_c] * bmi + reg_params_pos[params_offset_d];							
+							err_sz += fabs(pred_sz - curData) + noise;
+							err_reg += fabs(pred_reg - curData);								
+
+							cur_data_pos = pred_buffer + i*pred_buffer_block_size*pred_buffer_block_size + bmi*pred_buffer_block_size + i;
+							curData = *cur_data_pos;
+							pred_sz = cur_data_pos[-1] + cur_data_pos[-strip_dim1_offset]+ cur_data_pos[-strip_dim0_offset] - cur_data_pos[-strip_dim1_offset - 1] - cur_data_pos[-strip_dim0_offset - 1] - cur_data_pos[-strip_dim0_offset - strip_dim1_offset] + cur_data_pos[-strip_dim0_offset - strip_dim1_offset - 1];
+							pred_reg = reg_params_pos[0] * i + reg_params_pos[params_offset_b] * bmi + reg_params_pos[params_offset_c] * i + reg_params_pos[params_offset_d];							
+							err_sz += fabs(pred_sz - curData) + noise;
+							err_reg += fabs(pred_reg - curData);								
+
+							cur_data_pos = pred_buffer + i*pred_buffer_block_size*pred_buffer_block_size + bmi*pred_buffer_block_size + bmi;
+							curData = *cur_data_pos;
+							pred_sz = cur_data_pos[-1] + cur_data_pos[-strip_dim1_offset]+ cur_data_pos[-strip_dim0_offset] - cur_data_pos[-strip_dim1_offset - 1] - cur_data_pos[-strip_dim0_offset - 1] - cur_data_pos[-strip_dim0_offset - strip_dim1_offset] + cur_data_pos[-strip_dim0_offset - strip_dim1_offset - 1];
+							pred_reg = reg_params_pos[0] * i + reg_params_pos[params_offset_b] * bmi + reg_params_pos[params_offset_c] * bmi + reg_params_pos[params_offset_d];							
+							err_sz += fabs(pred_sz - curData) + noise;
+							err_reg += fabs(pred_reg - curData);
+						}
+						
+						use_reg = (err_reg < err_sz);
+
+					}
+					if(use_reg)
+					{
+						{
+							/*predict coefficients in current block via previous reg_block*/
+							float cur_coeff;
+							double diff, itvNum;
+							for(int e=0; e<4; e++){
+								cur_coeff = reg_params_pos[e*num_blocks];
+								diff = cur_coeff - last_coeffcients[e];
+								itvNum = fabs(diff)/precision[e] + 1;
+								if (itvNum < coeff_intvCapacity_sz){
+									if (diff < 0) itvNum = -itvNum;
+									coeff_type[e][coeff_index] = (int) (itvNum/2) + coeff_intvRadius;
+									last_coeffcients[e] = last_coeffcients[e] + 2 * (coeff_type[e][coeff_index] - coeff_intvRadius) * precision[e];
+									//guarantee compression error against the case of machine-epsilon
+									if(fabs(cur_coeff - last_coeffcients[e])>precision[e]){	
+										coeff_type[e][coeff_index] = 0;
+										last_coeffcients[e] = cur_coeff;	
+										coeff_unpred_data[e][coeff_unpredictable_count[e] ++] = cur_coeff;
+									}					
+								}
+								else{
+									coeff_type[e][coeff_index] = 0;
+									last_coeffcients[e] = cur_coeff;
+									coeff_unpred_data[e][coeff_unpredictable_count[e] ++] = cur_coeff;
+								}
+							}
+							coeff_index ++;
+						}
+						float curData;
+						float pred;
+						double itvNum;
+						double diff;
+						size_t index = 0;
+						size_t block_unpredictable_count = 0;
+						float * cur_data_pos = pred_buffer + pred_buffer_block_size*pred_buffer_block_size + pred_buffer_block_size + 1;
+						for(size_t ii=0; ii<block_size; ii++){
+							for(size_t jj=0; jj<block_size; jj++){
+								for(size_t kk=0; kk<block_size; kk++){
+									curData = *cur_data_pos;
+									pred = last_coeffcients[0] * ii + last_coeffcients[1] * jj + last_coeffcients[2] * kk + last_coeffcients[3];									
+									diff = curData - pred;
+									itvNum = fabs(diff)/tmp_realPrecision + 1;
+									if (itvNum < intvCapacity){
+										if (diff < 0) itvNum = -itvNum;
+										type[index] = (int) (itvNum/2) + intvRadius;
+										pred = pred + 2 * (type[index] - intvRadius) * tmp_realPrecision;
+										//guarantee compression error against the case of machine-epsilon
+										if(fabs(curData - pred)>tmp_realPrecision){	
+											type[index] = 0;
+											pred = curData;
+											unpredictable_data[block_unpredictable_count ++] = curData;
+										}		
+									}
+									else{
+										type[index] = 0;
+										pred = curData;
+										unpredictable_data[block_unpredictable_count ++] = curData;
+									}
+									index ++;	
+									cur_data_pos ++;
+								}
+								cur_data_pos ++;
+							}
+							cur_data_pos += pred_buffer_block_size;
+						}
+						total_unpred += block_unpredictable_count;
+						unpredictable_data += block_unpredictable_count;						
+						reg_count ++;
+					}
+					else{
+						// use SZ
+						// SZ prediction
+						unpredictable_count = 0;
+						float * cur_data_pos = pred_buffer + pred_buffer_block_size*pred_buffer_block_size + pred_buffer_block_size + 1;
+						float curData;
+						float pred3D;
+						double itvNum, diff;
+						size_t index = 0;
+						for(size_t ii=0; ii<block_size; ii++){
+							for(size_t jj=0; jj<block_size; jj++){
+								for(size_t kk=0; kk<block_size; kk++){
+									curData = *cur_data_pos;
+									pred3D = cur_data_pos[-1] + cur_data_pos[-strip_dim1_offset]+ cur_data_pos[-strip_dim0_offset] - cur_data_pos[-strip_dim1_offset - 1]
+											 - cur_data_pos[-strip_dim0_offset - 1] - cur_data_pos[-strip_dim0_offset - strip_dim1_offset] + cur_data_pos[-strip_dim0_offset - strip_dim1_offset - 1];
+									diff = curData - pred3D;
+									itvNum = fabs(diff)/realPrecision + 1;
+									if (itvNum < intvCapacity_sz){
+										if (diff < 0) itvNum = -itvNum;
+										type[index] = (int) (itvNum/2) + intvRadius;
+										*cur_data_pos = pred3D + 2 * (type[index] - intvRadius) * tmp_realPrecision;
+										//guarantee compression error against the case of machine-epsilon
+										if(fabs(curData - *cur_data_pos)>tmp_realPrecision){	
+											type[index] = 0;
+											*cur_data_pos = curData;	
+											unpredictable_data[unpredictable_count ++] = curData;
+										}					
+									}
+									else{
+										type[index] = 0;
+										*cur_data_pos = curData;
+										unpredictable_data[unpredictable_count ++] = curData;
+									}
+									index ++;
+									cur_data_pos ++;
+								}
+								cur_data_pos ++;
+							}
+							cur_data_pos += pred_buffer_block_size;
+						}
+						total_unpred += unpredictable_count;
+						unpredictable_data += unpredictable_count;
+						// change indicator
+						indicator_pos[k] = 1;
+					}// end SZ					
+					reg_params_pos ++;
+					type += block_size * block_size * block_size;
+				}
+				indicator_pos += num_z;
+			}
+		}
+	}
+	free(pred_buffer);
+	int stateNum = 2*quantization_intervals;
+	HuffmanTree* huffmanTree = createHuffmanTree(stateNum);
+
+	size_t nodeCount = 0;
+	init(huffmanTree, result_type, num_blocks*max_num_block_elements);
+	size_t i = 0;
+	for (i = 0; i < huffmanTree->stateNum; i++)
+		if (huffmanTree->code[i]) nodeCount++; 
+	nodeCount = nodeCount*2-1;
+
+	unsigned char *treeBytes;
+	unsigned int treeByteSize = convert_HuffTree_to_bytes_anyStates(huffmanTree, nodeCount, &treeBytes);
+
+	unsigned int meta_data_offset = 3 + 1 + MetaDataByteLength;
+	// NOTE(review): heuristic output size. The terms below do not itemize the four coefficient sections written later, and the type stream is encoded from num_blocks*max_num_block_elements entries rather than num_elements; confirm the slack always suffices (e.g. for very small inputs).
+	unsigned char * result = (unsigned char *) calloc(meta_data_offset + exe_params->SZ_SIZE_TYPE + sizeof(double) + sizeof(int) + sizeof(int) + treeByteSize + num_blocks * sizeof(unsigned short) + num_blocks * sizeof(unsigned short) + num_blocks * sizeof(float) + total_unpred * sizeof(float) + num_elements * sizeof(int), 1);
+	unsigned char * result_pos = result;
+	initRandomAccessBytes(result_pos);
+	
+	result_pos += meta_data_offset;
+	
+	sizeToBytes(result_pos,num_elements); //SZ_SIZE_TYPE: 4 or 8
+	result_pos += exe_params->SZ_SIZE_TYPE;
+
+	intToBytes_bigEndian(result_pos, block_size);
+	result_pos += sizeof(int);
+	doubleToBytes(result_pos, realPrecision);
+	result_pos += sizeof(double);
+	intToBytes_bigEndian(result_pos, quantization_intervals);
+	result_pos += sizeof(int);
+	intToBytes_bigEndian(result_pos, treeByteSize);
+	result_pos += sizeof(int);
+	intToBytes_bigEndian(result_pos, nodeCount);
+	result_pos += sizeof(int);
+	memcpy(result_pos, treeBytes, treeByteSize);
+	result_pos += treeByteSize;
+	free(treeBytes);
+
+	memcpy(result_pos, &use_mean, sizeof(unsigned char));
+	result_pos += sizeof(unsigned char);
+	memcpy(result_pos, &mean, sizeof(float));
+	result_pos += sizeof(float);
+	size_t indicator_size = convertIntArray2ByteArray_fast_1b_to_result(indicator, num_blocks, result_pos);
+	result_pos += indicator_size;
+	
+	//write the four regression-coefficient sections: precision, radius, Huffman tree, encoded types, unpredictable values
+	if(reg_count > 0){
+		for(int e=0; e<4; e++){
+			int stateNum = 2*coeff_intvCapacity_sz;
+			HuffmanTree* huffmanTree = createHuffmanTree(stateNum);
+			size_t nodeCount = 0;
+			init(huffmanTree, coeff_type[e], reg_count);
+			size_t i = 0;
+			for (i = 0; i < huffmanTree->stateNum; i++)
+				if (huffmanTree->code[i]) nodeCount++; 
+			nodeCount = nodeCount*2-1;
+			unsigned char *treeBytes;
+			unsigned int treeByteSize = convert_HuffTree_to_bytes_anyStates(huffmanTree, nodeCount, &treeBytes);
+			doubleToBytes(result_pos, precision[e]);
+			result_pos += sizeof(double);
+			intToBytes_bigEndian(result_pos, coeff_intvRadius);
+			result_pos += sizeof(int);
+			intToBytes_bigEndian(result_pos, treeByteSize);
+			result_pos += sizeof(int);
+			intToBytes_bigEndian(result_pos, nodeCount);
+			result_pos += sizeof(int);
+			memcpy(result_pos, treeBytes, treeByteSize);		
+			result_pos += treeByteSize;
+			free(treeBytes);
+			size_t typeArray_size = 0;
+			encode(huffmanTree, coeff_type[e], reg_count, result_pos + sizeof(size_t), &typeArray_size);
+			sizeToBytes(result_pos, typeArray_size);
+			result_pos += sizeof(size_t) + typeArray_size;
+			intToBytes_bigEndian(result_pos, coeff_unpredictable_count[e]);
+			result_pos += sizeof(int);
+			memcpy(result_pos, coeff_unpred_data[e], coeff_unpredictable_count[e]*sizeof(float));
+			result_pos += coeff_unpredictable_count[e]*sizeof(float);
+			SZ_ReleaseHuffman(huffmanTree);
+		}
+	}
+	free(coeff_result_type);
+	free(coeff_unpredictable_data);
+	
+	//record the number of unpredictable data and also store them
+	memcpy(result_pos, &total_unpred, sizeof(size_t));
+	result_pos += sizeof(size_t);
+	memcpy(result_pos, result_unpredictable_data, total_unpred * sizeof(float));
+	result_pos += total_unpred * sizeof(float);
+	size_t typeArray_size = 0;
+	encode(huffmanTree, result_type, num_blocks*max_num_block_elements, result_pos, &typeArray_size);
+	result_pos += typeArray_size;
+	size_t totalEncodeSize = result_pos - result;
+	free(indicator);
+	free(result_unpredictable_data);
+	free(result_type);
+	free(reg_params);
+
+	
+	SZ_ReleaseHuffman(huffmanTree);
+	*comp_size = totalEncodeSize;
+	return result;
+}
diff --git a/thirdparty/SZ/sz/src/sz_float_pwr.c b/thirdparty/SZ/sz/src/sz_float_pwr.c
index 644afddf46bf707abc6c500a8e1ba96b09020c5e..92a449f5e934a62eb05b55573f8002048292a648 100644
--- a/thirdparty/SZ/sz/src/sz_float_pwr.c
+++ b/thirdparty/SZ/sz/src/sz_float_pwr.c
@@ -23,6 +23,7 @@
 #include "sz_float_pwr.h"
 #include "zlib.h"
 #include "rw.h"
+#include "utility.h"
 
 void compute_segment_precisions_float_1D(float *oriData, size_t dataLength, float* pwrErrBound, unsigned char* pwrErrBoundBytes, double globalPrecision)
 {
@@ -1780,3 +1781,190 @@ size_t dataLength, double absErrBound, double relBoundRatio, double pwrErrRatio,
 
         free_TightDataPointStorageF(tdps);
 }
+
+#include <stdbool.h>
+
+void SZ_compress_args_float_NoCkRngeNoGzip_1D_pwr_pre_log(unsigned char** newByteData, float *oriData, double pwrErrRatio, size_t dataLength, size_t *outSize, float min, float max){
+	// 1D pre-log path: log2(|oriData[i]|) is handed to SZ_compress_float_1D_MDQ with a bound derived from pwrErrRatio; signs are kept in a separate byte array compressed with sz_lossless_compress.
+	float * log_data = (float *) malloc(dataLength * sizeof(float)); // NOTE(review): allocation result is not checked
+	// signs[i] is 1 where oriData[i] < 0 and 0 otherwise (NOTE(review): this allocation is not checked either)
+	unsigned char * signs = (unsigned char *) malloc(dataLength);
+	memset(signs, 0, dataLength);
+	// preprocess: seed the log-domain extremes from min/max (presumably the value range of oriData -- confirm with caller), skipping an endpoint that is exactly 0
+	float max_abs_log_data;
+    if(min == 0) max_abs_log_data = fabs(log2(fabs(max)));
+    else if(max == 0) max_abs_log_data = fabs(log2(fabs(min)));
+    else max_abs_log_data = fabs(log2(fabs(min))) > fabs(log2(fabs(max))) ? fabs(log2(fabs(min))) : fabs(log2(fabs(max)));
+    float min_log_data = max_abs_log_data;
+	bool positive = true; // cleared as soon as one negative element is seen
+	for(size_t i=0; i<dataLength; i++){
+		if(oriData[i] < 0){
+			signs[i] = 1;
+			log_data[i] = -oriData[i];
+			positive = false;
+		}
+		else
+			log_data[i] = oriData[i];
+		if(log_data[i] > 0){ // zeros are left as 0 here and patched once realPrecision is known
+			log_data[i] = log2(log_data[i]);
+			if(log_data[i] > max_abs_log_data) max_abs_log_data = log_data[i];
+			if(log_data[i] < min_log_data) min_log_data = log_data[i];
+		}
+	}
+	// range/median are computed while zero inputs still hold 0 in log_data, i.e. before the placeholder below is written
+	float valueRangeSize, medianValue_f;
+	computeRangeSize_float(log_data, dataLength, &valueRangeSize, &medianValue_f);	
+	if(fabs(min_log_data) > max_abs_log_data) max_abs_log_data = fabs(min_log_data);
+	double realPrecision = log2(1.0 + pwrErrRatio) - max_abs_log_data * 1.2e-7; // log2-domain bound; the 1.2e-7 term is presumably a float rounding margin -- confirm
+	for(size_t i=0; i<dataLength; i++){
+		if(oriData[i] == 0){
+			log_data[i] = min_log_data - 2.0001*realPrecision; // placeholder for an exact zero, 2.0001*realPrecision below min_log_data
+		}
+	}
+	// minLogValue is set between the zero placeholder and min_log_data; presumably the decompressor uses it as the zero threshold -- confirm
+    TightDataPointStorageF* tdps = SZ_compress_float_1D_MDQ(log_data, dataLength, realPrecision, valueRangeSize, medianValue_f);
+    tdps->minLogValue = min_log_data - 1.0001*realPrecision;
+    free(log_data);
+    if(!positive){
+	    unsigned char * comp_signs;
+		// compress signs; the result is carried in the pwrErrBoundBytes field (NOTE(review): the unsigned long size is stored into an int field)
+		unsigned long signSize = sz_lossless_compress(confparams_cpr->losslessCompressor, confparams_cpr->gzipMode, signs, dataLength, &comp_signs);
+		tdps->pwrErrBoundBytes = comp_signs;
+		tdps->pwrErrBoundBytes_size = signSize;
+	}
+	else{
+		tdps->pwrErrBoundBytes = NULL;
+		tdps->pwrErrBoundBytes_size = 0;
+	}
+	free(signs);
+	// flatten tdps into *newByteData; if the result is larger than the raw floats, SZ_compress_args_float_StoreOriData is called instead
+    convertTDPStoFlatBytes_float(tdps, newByteData, outSize);
+    if(*outSize>dataLength*sizeof(float))
+            SZ_compress_args_float_StoreOriData(oriData, dataLength+2, tdps, newByteData, outSize); // NOTE(review): passes dataLength+2 although oriData is indexed as dataLength elements above -- confirm
+
+    free_TightDataPointStorageF(tdps);
+}
+
+void SZ_compress_args_float_NoCkRngeNoGzip_2D_pwr_pre_log(unsigned char** newByteData, float *oriData, double pwrErrRatio, size_t r1, size_t r2, size_t *outSize, float min, float max){
+	// 2D pre-log path: log2(|oriData[i]|) is handed to SZ_compress_float_2D_MDQ with a bound derived from pwrErrRatio; signs are kept in a separate byte array compressed with sz_lossless_compress.
+	size_t dataLength = r1 * r2;
+	float * log_data = (float *) malloc(dataLength * sizeof(float)); // NOTE(review): allocation result is not checked
+	// signs[i] is 1 where oriData[i] < 0 and 0 otherwise (NOTE(review): this allocation is not checked either)
+	unsigned char * signs = (unsigned char *) malloc(dataLength);
+	memset(signs, 0, dataLength);
+	// preprocess: seed the log-domain extremes from min/max (presumably the value range of oriData -- confirm with caller), skipping an endpoint that is exactly 0
+	float max_abs_log_data;
+    if(min == 0) max_abs_log_data = fabs(log2(fabs(max)));
+    else if(max == 0) max_abs_log_data = fabs(log2(fabs(min)));
+    else max_abs_log_data = fabs(log2(fabs(min))) > fabs(log2(fabs(max))) ? fabs(log2(fabs(min))) : fabs(log2(fabs(max)));
+    float min_log_data = max_abs_log_data;
+	bool positive = true; // cleared as soon as one negative element is seen
+	for(size_t i=0; i<dataLength; i++){
+		if(oriData[i] < 0){
+			signs[i] = 1;
+			log_data[i] = -oriData[i];
+			positive = false;
+		}
+		else
+			log_data[i] = oriData[i];
+		if(log_data[i] > 0){ // zeros are left as 0 here and patched once realPrecision is known
+			log_data[i] = log2(log_data[i]);
+			if(log_data[i] > max_abs_log_data) max_abs_log_data = log_data[i];
+			if(log_data[i] < min_log_data) min_log_data = log_data[i];
+		}
+	}
+	// range/median are computed while zero inputs still hold 0 in log_data, i.e. before the placeholder below is written
+	float valueRangeSize, medianValue_f;
+	computeRangeSize_float(log_data, dataLength, &valueRangeSize, &medianValue_f);	
+	if(fabs(min_log_data) > max_abs_log_data) max_abs_log_data = fabs(min_log_data);
+	double realPrecision = log2(1.0 + pwrErrRatio) - max_abs_log_data * 1.2e-7; // log2-domain bound; the 1.2e-7 term is presumably a float rounding margin -- confirm
+	for(size_t i=0; i<dataLength; i++){
+		if(oriData[i] == 0){
+			log_data[i] = min_log_data - 2.0001*realPrecision; // placeholder for an exact zero, 2.0001*realPrecision below min_log_data
+		}
+	}
+	// minLogValue is set between the zero placeholder and min_log_data; presumably the decompressor uses it as the zero threshold -- confirm
+    TightDataPointStorageF* tdps = SZ_compress_float_2D_MDQ(log_data, r1, r2, realPrecision, valueRangeSize, medianValue_f);
+    tdps->minLogValue = min_log_data - 1.0001*realPrecision;
+    free(log_data);
+    if(!positive){
+	    unsigned char * comp_signs;
+		// compress signs; the result is carried in the pwrErrBoundBytes field (NOTE(review): the unsigned long size is stored into an int field)
+		unsigned long signSize = sz_lossless_compress(confparams_cpr->losslessCompressor, confparams_cpr->gzipMode, signs, dataLength, &comp_signs);
+		tdps->pwrErrBoundBytes = comp_signs;
+		tdps->pwrErrBoundBytes_size = signSize;
+	}
+	else{
+		tdps->pwrErrBoundBytes = NULL;
+		tdps->pwrErrBoundBytes_size = 0;
+	}
+	free(signs);
+	// flatten tdps into *newByteData; if the result is larger than the raw floats, SZ_compress_args_float_StoreOriData is called instead
+    convertTDPStoFlatBytes_float(tdps, newByteData, outSize);
+    if(*outSize>dataLength*sizeof(float))
+            SZ_compress_args_float_StoreOriData(oriData, dataLength+2, tdps, newByteData, outSize); // NOTE(review): passes dataLength+2 although oriData is indexed as dataLength elements above -- confirm
+
+    free_TightDataPointStorageF(tdps);
+}
+
+void SZ_compress_args_float_NoCkRngeNoGzip_3D_pwr_pre_log(unsigned char** newByteData, float *oriData, double pwrErrRatio, size_t r1, size_t r2, size_t r3, size_t *outSize, float min, float max){
+	// 3D pre-log path: log2(|oriData[i]|) is handed to SZ_compress_float_3D_MDQ with a bound derived from pwrErrRatio; signs are kept in a separate byte array compressed with sz_lossless_compress.
+	size_t dataLength = r1 * r2 * r3;
+	float * log_data = (float *) malloc(dataLength * sizeof(float)); // NOTE(review): allocation result is not checked
+	// signs[i] is 1 where oriData[i] < 0 and 0 otherwise (NOTE(review): this allocation is not checked either)
+	unsigned char * signs = (unsigned char *) malloc(dataLength);
+	memset(signs, 0, dataLength);
+	// preprocess: seed the log-domain extremes from min/max (presumably the value range of oriData -- confirm with caller), skipping an endpoint that is exactly 0
+	float max_abs_log_data;
+    if(min == 0) max_abs_log_data = fabs(log2(fabs(max)));
+    else if(max == 0) max_abs_log_data = fabs(log2(fabs(min)));
+    else max_abs_log_data = fabs(log2(fabs(min))) > fabs(log2(fabs(max))) ? fabs(log2(fabs(min))) : fabs(log2(fabs(max)));
+    float min_log_data = max_abs_log_data;
+	bool positive = true; // cleared as soon as one negative element is seen
+	for(size_t i=0; i<dataLength; i++){
+		if(oriData[i] < 0){
+			signs[i] = 1;
+			log_data[i] = -oriData[i];
+			positive = false;
+		}
+		else
+			log_data[i] = oriData[i];
+		if(log_data[i] > 0){ // zeros are left as 0 here and patched once realPrecision is known
+			log_data[i] = log2(log_data[i]);
+			if(log_data[i] > max_abs_log_data) max_abs_log_data = log_data[i];
+			if(log_data[i] < min_log_data) min_log_data = log_data[i];
+		}
+	}
+	// range/median are computed while zero inputs still hold 0 in log_data, i.e. before the placeholder below is written
+	float valueRangeSize, medianValue_f;
+	computeRangeSize_float(log_data, dataLength, &valueRangeSize, &medianValue_f);	
+	if(fabs(min_log_data) > max_abs_log_data) max_abs_log_data = fabs(min_log_data);
+	double realPrecision = log2(1.0 + pwrErrRatio) - max_abs_log_data * 1.2e-7; // log2-domain bound; the 1.2e-7 term is presumably a float rounding margin -- confirm
+	for(size_t i=0; i<dataLength; i++){
+		if(oriData[i] == 0){
+			log_data[i] = min_log_data - 2.0001*realPrecision; // placeholder for an exact zero, 2.0001*realPrecision below min_log_data
+		}
+	}
+	// minLogValue is set between the zero placeholder and min_log_data; presumably the decompressor uses it as the zero threshold -- confirm
+    TightDataPointStorageF* tdps = SZ_compress_float_3D_MDQ(log_data, r1, r2, r3, realPrecision, valueRangeSize, medianValue_f);
+    tdps->minLogValue = min_log_data - 1.0001*realPrecision;
+    free(log_data);
+    if(!positive){
+	    unsigned char * comp_signs;
+		// compress signs; the result is carried in the pwrErrBoundBytes field (NOTE(review): the unsigned long size is stored into an int field)
+		unsigned long signSize = sz_lossless_compress(confparams_cpr->losslessCompressor, confparams_cpr->gzipMode, signs, dataLength, &comp_signs);
+		tdps->pwrErrBoundBytes = comp_signs;
+		tdps->pwrErrBoundBytes_size = signSize;
+	}
+	else{
+		tdps->pwrErrBoundBytes = NULL;
+		tdps->pwrErrBoundBytes_size = 0;
+	}
+	free(signs);
+	// flatten tdps into *newByteData; if the result is larger than the raw floats, SZ_compress_args_float_StoreOriData is called instead
+    convertTDPStoFlatBytes_float(tdps, newByteData, outSize);
+    if(*outSize>dataLength*sizeof(float))
+            SZ_compress_args_float_StoreOriData(oriData, dataLength+2, tdps, newByteData, outSize); // NOTE(review): passes dataLength+2 although oriData is indexed as dataLength elements above -- confirm
+
+    free_TightDataPointStorageF(tdps);
+}
diff --git a/thirdparty/SZ/sz/src/sz_int16.c b/thirdparty/SZ/sz/src/sz_int16.c
index fc91dd1a6f64af0d19c14540be9ecf57bf4bb443..82262349a48eda8f11e0470dc54ba01f483e60fa 100644
--- a/thirdparty/SZ/sz/src/sz_int16.c
+++ b/thirdparty/SZ/sz/src/sz_int16.c
@@ -21,6 +21,7 @@
 #include "rw.h"
 #include "TightDataPointStorageI.h"
 #include "sz_int16.h"
+#include "utility.h"
 
 unsigned int optimize_intervals_int16_1D(int16_t *oriData, size_t dataLength, double realPrecision)
 {	
@@ -266,7 +267,7 @@ TightDataPointStorageI* SZ_compress_int16_1D_MDQ(int16_t *oriData, size_t dataLe
 		//pred = 2*last3CmprsData[0] - last3CmprsData[1];
 		pred = last3CmprsData[0];
 		predAbsErr = llabs(curData - pred);	
-		if(predAbsErr<=checkRadius)
+		if(predAbsErr<checkRadius)
 		{
 			state = (predAbsErr/realPrecision+1)/2;
 			if(curData>=pred)
@@ -1369,7 +1370,7 @@ int errBoundMode, double absErr_Bound, double relBoundRatio)
 		}
 		else if(confparams_cpr->szMode==SZ_BEST_COMPRESSION || confparams_cpr->szMode==SZ_DEFAULT_COMPRESSION)
 		{
-			*outSize = zlib_compress5(tmpByteData, tmpOutSize, newByteData, confparams_cpr->gzipMode);
+			*outSize = sz_lossless_compress(confparams_cpr->losslessCompressor, confparams_cpr->gzipMode, tmpByteData, tmpOutSize, newByteData);
 			free(tmpByteData);
 		}
 		else
diff --git a/thirdparty/SZ/sz/src/sz_int32.c b/thirdparty/SZ/sz/src/sz_int32.c
index bcd97834e18f1a9844a49549a2b10e73be63d67e..6c5a66eeddc8b5466dda4de49da01ddefd575e6a 100644
--- a/thirdparty/SZ/sz/src/sz_int32.c
+++ b/thirdparty/SZ/sz/src/sz_int32.c
@@ -21,6 +21,7 @@
 #include "rw.h"
 #include "TightDataPointStorageI.h"
 #include "sz_int32.h"
+#include "utility.h"
 
 unsigned int optimize_intervals_int32_1D(int32_t *oriData, size_t dataLength, double realPrecision)
 {	
@@ -268,7 +269,7 @@ TightDataPointStorageI* SZ_compress_int32_1D_MDQ(int32_t *oriData, size_t dataLe
 		//pred = 2*last3CmprsData[0] - last3CmprsData[1];
 		pred = last3CmprsData[0];
 		predAbsErr = llabs(curData - pred);	
-		if(predAbsErr<=checkRadius)
+		if(predAbsErr<checkRadius)
 		{
 			state = (predAbsErr/realPrecision+1)/2;
 			if(curData>=pred)
@@ -1253,7 +1254,7 @@ int errBoundMode, double absErr_Bound, double relBoundRatio)
 		}
 		else if(confparams_cpr->szMode==SZ_BEST_COMPRESSION || confparams_cpr->szMode==SZ_DEFAULT_COMPRESSION)
 		{
-			*outSize = zlib_compress5(tmpByteData, tmpOutSize, newByteData, confparams_cpr->gzipMode);
+			*outSize = sz_lossless_compress(confparams_cpr->losslessCompressor, confparams_cpr->gzipMode, tmpByteData, tmpOutSize, newByteData);
 			free(tmpByteData);
 		}
 		else
diff --git a/thirdparty/SZ/sz/src/sz_int64.c b/thirdparty/SZ/sz/src/sz_int64.c
index eb973775aa9cf1565e92797217b96aa102a678ae..065fb16e49dcd2e68a546610a85d9b3f17c44154 100644
--- a/thirdparty/SZ/sz/src/sz_int64.c
+++ b/thirdparty/SZ/sz/src/sz_int64.c
@@ -21,6 +21,7 @@
 #include "rw.h"
 #include "TightDataPointStorageI.h"
 #include "sz_int64.h"
+#include "utility.h"
 
 unsigned int optimize_intervals_int64_1D(int64_t *oriData, size_t dataLength, double realPrecision)
 {	
@@ -269,7 +270,7 @@ TightDataPointStorageI* SZ_compress_int64_1D_MDQ(int64_t *oriData, size_t dataLe
 		//pred = 2*last3CmprsData[0] - last3CmprsData[1];
 		pred = last3CmprsData[0];
 		predAbsErr = llabs(curData - pred);	
-		if(predAbsErr<=checkRadius)
+		if(predAbsErr<checkRadius)
 		{
 			state = (predAbsErr/realPrecision+1)/2;
 			if(curData>=pred)
@@ -1254,7 +1255,7 @@ int errBoundMode, double absErr_Bound, double relBoundRatio)
 		}
 		else if(confparams_cpr->szMode==SZ_BEST_COMPRESSION || confparams_cpr->szMode==SZ_DEFAULT_COMPRESSION)
 		{
-			*outSize = zlib_compress5(tmpByteData, tmpOutSize, newByteData, confparams_cpr->gzipMode);
+			*outSize = sz_lossless_compress(confparams_cpr->losslessCompressor, confparams_cpr->gzipMode, tmpByteData, tmpOutSize, newByteData);
 			free(tmpByteData);
 		}
 		else
diff --git a/thirdparty/SZ/sz/src/sz_int8.c b/thirdparty/SZ/sz/src/sz_int8.c
index c869681022f0bd8872b4dbfb632df616c77aa4b9..83febd0de64b14be3915a9fa81e5fdb907345fe6 100644
--- a/thirdparty/SZ/sz/src/sz_int8.c
+++ b/thirdparty/SZ/sz/src/sz_int8.c
@@ -21,6 +21,7 @@
 #include "rw.h"
 #include "TightDataPointStorageI.h"
 #include "sz_int8.h"
+#include "utility.h"
 
 unsigned int optimize_intervals_int8_1D(int8_t *oriData, size_t dataLength, double realPrecision)
 {	
@@ -266,7 +267,7 @@ TightDataPointStorageI* SZ_compress_int8_1D_MDQ(int8_t *oriData, size_t dataLeng
 		//pred = 2*last3CmprsData[0] - last3CmprsData[1];
 		pred = last3CmprsData[0];
 		predAbsErr = llabs(curData - pred);	
-		if(predAbsErr<=checkRadius)
+		if(predAbsErr<checkRadius)
 		{
 			state = (predAbsErr/realPrecision+1)/2;
 			if(curData>=pred)
@@ -1370,7 +1371,7 @@ int errBoundMode, double absErr_Bound, double relBoundRatio)
 		}
 		else if(confparams_cpr->szMode==SZ_BEST_COMPRESSION || confparams_cpr->szMode==SZ_DEFAULT_COMPRESSION)
 		{
-			*outSize = zlib_compress5(tmpByteData, tmpOutSize, newByteData, confparams_cpr->gzipMode);
+			*outSize = sz_lossless_compress(confparams_cpr->losslessCompressor, confparams_cpr->gzipMode, tmpByteData, tmpOutSize, newByteData);
 			free(tmpByteData);
 		}
 		else
diff --git a/thirdparty/SZ/sz/src/sz_uint16.c b/thirdparty/SZ/sz/src/sz_uint16.c
index 4200b31651216119d09dee8d81b385d70d498ef5..b55f00726db09c68edcfbd89c9681cd23fdfa728 100644
--- a/thirdparty/SZ/sz/src/sz_uint16.c
+++ b/thirdparty/SZ/sz/src/sz_uint16.c
@@ -21,6 +21,7 @@
 #include "rw.h"
 #include "TightDataPointStorageI.h"
 #include "sz_uint16.h"
+#include "utility.h"
 
 unsigned int optimize_intervals_uint16_1D(uint16_t *oriData, size_t dataLength, double realPrecision)
 {	
@@ -266,7 +267,7 @@ TightDataPointStorageI* SZ_compress_uint16_1D_MDQ(uint16_t *oriData, size_t data
 		//pred = 2*last3CmprsData[0] - last3CmprsData[1];
 		pred = last3CmprsData[0];
 		predAbsErr = llabs(curData - pred);	
-		if(predAbsErr<=checkRadius)
+		if(predAbsErr<checkRadius)
 		{
 			state = (predAbsErr/realPrecision+1)/2;
 			if(curData>=pred)
@@ -1369,7 +1370,7 @@ int errBoundMode, double absErr_Bound, double relBoundRatio)
 		}
 		else if(confparams_cpr->szMode==SZ_BEST_COMPRESSION || confparams_cpr->szMode==SZ_DEFAULT_COMPRESSION)
 		{
-			*outSize = zlib_compress5(tmpByteData, tmpOutSize, newByteData, confparams_cpr->gzipMode);
+			*outSize = sz_lossless_compress(confparams_cpr->losslessCompressor, confparams_cpr->gzipMode, tmpByteData, tmpOutSize, newByteData);
 			free(tmpByteData);
 		}
 		else
diff --git a/thirdparty/SZ/sz/src/sz_uint32.c b/thirdparty/SZ/sz/src/sz_uint32.c
index 29d596165910a4fd476179d73f1d917706a4c1d5..6f27510f258fc43388b310808e82ad8c50d4b772 100644
--- a/thirdparty/SZ/sz/src/sz_uint32.c
+++ b/thirdparty/SZ/sz/src/sz_uint32.c
@@ -21,6 +21,7 @@
 #include "rw.h"
 #include "TightDataPointStorageI.h"
 #include "sz_uint32.h"
+#include "utility.h"
 
 unsigned int optimize_intervals_uint32_1D(uint32_t *oriData, size_t dataLength, double realPrecision)
 {	
@@ -268,7 +269,7 @@ TightDataPointStorageI* SZ_compress_uint32_1D_MDQ(uint32_t *oriData, size_t data
 		//pred = 2*last3CmprsData[0] - last3CmprsData[1];
 		pred = last3CmprsData[0];
 		predAbsErr = llabs(curData - pred);	
-		if(predAbsErr<=checkRadius)
+		if(predAbsErr<checkRadius)
 		{
 			state = (predAbsErr/realPrecision+1)/2;
 			if(curData>=pred)
@@ -1253,7 +1254,7 @@ int errBoundMode, double absErr_Bound, double relBoundRatio)
 		}
 		else if(confparams_cpr->szMode==SZ_BEST_COMPRESSION || confparams_cpr->szMode==SZ_DEFAULT_COMPRESSION)
 		{
-			*outSize = zlib_compress5(tmpByteData, tmpOutSize, newByteData, confparams_cpr->gzipMode);
+			*outSize = sz_lossless_compress(confparams_cpr->losslessCompressor, confparams_cpr->gzipMode, tmpByteData, tmpOutSize, newByteData);
 			free(tmpByteData);
 		}
 		else
diff --git a/thirdparty/SZ/sz/src/sz_uint64.c b/thirdparty/SZ/sz/src/sz_uint64.c
index b8cb8bc3d1e588ff8af9e91b8036ee83a332f274..7d2eca843f9205a5b3704d49e5da64a67f315fb9 100644
--- a/thirdparty/SZ/sz/src/sz_uint64.c
+++ b/thirdparty/SZ/sz/src/sz_uint64.c
@@ -21,6 +21,7 @@
 #include "rw.h"
 #include "TightDataPointStorageI.h"
 #include "sz_uint64.h"
+#include "utility.h"
 
 unsigned int optimize_intervals_uint64_1D(uint64_t *oriData, size_t dataLength, double realPrecision)
 {	
@@ -268,7 +269,7 @@ TightDataPointStorageI* SZ_compress_uint64_1D_MDQ(uint64_t *oriData, size_t data
 		//pred = 2*last3CmprsData[0] - last3CmprsData[1];
 		pred = last3CmprsData[0];
 		predAbsErr = llabs(curData - pred);	
-		if(predAbsErr<=checkRadius)
+		if(predAbsErr<checkRadius)
 		{
 			state = (predAbsErr/realPrecision+1)/2;
 			if(curData>=pred)
@@ -1253,7 +1254,7 @@ int errBoundMode, double absErr_Bound, double relBoundRatio)
 		}
 		else if(confparams_cpr->szMode==SZ_BEST_COMPRESSION || confparams_cpr->szMode==SZ_DEFAULT_COMPRESSION)
 		{
-			*outSize = zlib_compress5(tmpByteData, tmpOutSize, newByteData, confparams_cpr->gzipMode);
+			*outSize = sz_lossless_compress(confparams_cpr->losslessCompressor, confparams_cpr->gzipMode, tmpByteData, tmpOutSize, newByteData);
 			free(tmpByteData);
 		}
 		else
diff --git a/thirdparty/SZ/sz/src/sz_uint8.c b/thirdparty/SZ/sz/src/sz_uint8.c
index 6ca4ae48a3bd7a12f6ea1342ea7ffabc8a9b3f1d..6865564dd9e8304de3bf973227541775e13b80ea 100644
--- a/thirdparty/SZ/sz/src/sz_uint8.c
+++ b/thirdparty/SZ/sz/src/sz_uint8.c
@@ -21,6 +21,7 @@
 #include "rw.h"
 #include "TightDataPointStorageI.h"
 #include "sz_uint8.h"
+#include "utility.h"
 
 unsigned int optimize_intervals_uint8_1D(uint8_t *oriData, size_t dataLength, double realPrecision)
 {	
@@ -266,7 +267,7 @@ TightDataPointStorageI* SZ_compress_uint8_1D_MDQ(uint8_t *oriData, size_t dataLe
 		//pred = 2*last3CmprsData[0] - last3CmprsData[1];
 		pred = last3CmprsData[0];
 		predAbsErr = llabs(curData - pred);	
-		if(predAbsErr<=checkRadius)
+		if(predAbsErr<checkRadius)
 		{
 			state = (predAbsErr/realPrecision+1)/2;
 			if(curData>=pred)
@@ -1370,7 +1371,7 @@ int errBoundMode, double absErr_Bound, double relBoundRatio)
 		}
 		else if(confparams_cpr->szMode==SZ_BEST_COMPRESSION || confparams_cpr->szMode==SZ_DEFAULT_COMPRESSION)
 		{
-			*outSize = zlib_compress5(tmpByteData, tmpOutSize, newByteData, confparams_cpr->gzipMode);
+			*outSize = sz_lossless_compress(confparams_cpr->losslessCompressor, confparams_cpr->gzipMode, tmpByteData, tmpOutSize, newByteData);
 			free(tmpByteData);
 		}
 		else
diff --git a/thirdparty/SZ/sz/src/szd_double.c b/thirdparty/SZ/sz/src/szd_double.c
index 1440e2d012977829e6fc4c907e6ebb0ac3e24c48..09d585178dd97523e5ee3ea407680da168a87168 100644
--- a/thirdparty/SZ/sz/src/szd_double.c
+++ b/thirdparty/SZ/sz/src/szd_double.c
@@ -16,6 +16,7 @@
 #include "Huffman.h"
 #include "szd_double_pwr.h"
 #include "szd_double_ts.h"
+#include "utility.h"
 
 int SZ_decompress_args_double(double** newData, size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, unsigned char* cmpBytes, size_t cmpSize)
 {
@@ -29,10 +30,10 @@ int SZ_decompress_args_double(double** newData, size_t r5, size_t r4, size_t r3,
 	unsigned char* szTmpBytes;
 	if(cmpSize!=12+4+MetaDataByteLength && cmpSize!=12+8+MetaDataByteLength)
 	{
-		int isZlib = isZlibFormat(cmpBytes[0], cmpBytes[1]);
+		confparams_dec->losslessCompressor = is_lossless_compressed_data(cmpBytes, cmpSize);
 		if(confparams_dec->szMode!=SZ_TEMPORAL_COMPRESSION)
 		{
-			if(isZlib)
+			if(confparams_dec->losslessCompressor!=-1)
 				confparams_dec->szMode = SZ_BEST_COMPRESSION;
 			else
 				confparams_dec->szMode = SZ_BEST_SPEED;			
@@ -46,7 +47,7 @@ int SZ_decompress_args_double(double** newData, size_t r5, size_t r4, size_t r3,
 		{
 			if(targetUncompressSize<MIN_ZLIB_DEC_ALLOMEM_BYTES) //Considering the minimum size
 				targetUncompressSize = MIN_ZLIB_DEC_ALLOMEM_BYTES; 			
-			tmpSize = zlib_uncompress5(cmpBytes, (unsigned long)cmpSize, &szTmpBytes, (unsigned long)targetUncompressSize+4+MetaDataByteLength+exe_params->SZ_SIZE_TYPE);			
+			tmpSize = sz_lossless_decompress(confparams_dec->losslessCompressor, cmpBytes, (unsigned long)cmpSize, &szTmpBytes, (unsigned long)targetUncompressSize+4+MetaDataByteLength+exe_params->SZ_SIZE_TYPE);			
 			//szTmpBytes = (unsigned char*)malloc(sizeof(unsigned char)*tmpSize);
 			//memcpy(szTmpBytes, tmpBytes, tmpSize);
 			//free(tmpBytes); //release useless memory		
@@ -80,22 +81,45 @@ int SZ_decompress_args_double(double** newData, size_t r5, size_t r4, size_t r3,
 				(*newData)[i] = bytesToDouble(p);
 		}		
 	}
-	else if (dim == 1)
-		getSnapshotData_double_1D(newData,r1,tdps, errBoundMode);
-	else
-	if (dim == 2)
-		getSnapshotData_double_2D(newData,r2,r1,tdps, errBoundMode);
-	else
-	if (dim == 3)
-		getSnapshotData_double_3D(newData,r3,r2,r1,tdps, errBoundMode);
-	else
-	if (dim == 4)
-		getSnapshotData_double_4D(newData,r4,r3,r2,r1,tdps, errBoundMode);
-	else
+	else 
 	{
-		printf("Error: currently support only at most 4 dimensions!\n");
-		status = SZ_DERR;
-	}
+		if(tdps->raBytes_size > 0) //v2.0
+		{
+			if (dim == 1)
+				getSnapshotData_double_1D(newData,r1,tdps, errBoundMode);
+			else if(dim == 2)
+				decompressDataSeries_double_2D_nonblocked_with_blocked_regression(newData, r2, r1, tdps->raBytes);
+			else if(dim == 3)
+				decompressDataSeries_double_3D_nonblocked_with_blocked_regression(newData, r3, r2, r1, tdps->raBytes);
+			else if(dim == 4)
+				decompressDataSeries_double_3D_nonblocked_with_blocked_regression(newData, r4*r3, r2, r1, tdps->raBytes);
+			else
+			{
+				printf("Error: currently support only at most 4 dimensions!\n");
+				status = SZ_DERR;
+			}	
+		}
+		else //1.4.13
+		{
+			if (dim == 1)
+				getSnapshotData_double_1D(newData,r1,tdps, errBoundMode);
+			else
+			if (dim == 2)
+				getSnapshotData_double_2D(newData,r2,r1,tdps, errBoundMode);
+			else
+			if (dim == 3)
+				getSnapshotData_double_3D(newData,r3,r2,r1,tdps, errBoundMode);
+			else
+			if (dim == 4)
+				getSnapshotData_double_4D(newData,r4,r3,r2,r1,tdps, errBoundMode);			
+			else
+			{
+				printf("Error: currently support only at most 4 dimensions!\n");
+				status = SZ_DERR;
+			}			
+		}
+	}	
+
 	free_TightDataPointStorageD2(tdps);
 	if(confparams_dec->szMode!=SZ_BEST_SPEED && cmpSize!=12+MetaDataByteLength+exe_params->SZ_SIZE_TYPE)
 		free(szTmpBytes);	
@@ -1647,8 +1671,8 @@ void getSnapshotData_double_1D(double** data, size_t dataSeriesLength, TightData
 			}
 			else 
 			{
-				//decompressDataSeries_double_1D_pwr(data, dataSeriesLength, tdps);
-				decompressDataSeries_double_1D_pwrgroup(data, dataSeriesLength, tdps);
+				decompressDataSeries_double_1D_pwr_pre_log(data, dataSeriesLength, tdps);
+				//decompressDataSeries_double_1D_pwrgroup(data, dataSeriesLength, tdps);
 			}
 			return;
 		} else {
@@ -1671,7 +1695,8 @@ void getSnapshotData_double_1D(double** data, size_t dataSeriesLength, TightData
 			if(errBoundMode < PW_REL)
 				decompressDataSeries_double_1D(&decmpData, dataSeriesLength, tdps);
 			else 
-				decompressDataSeries_double_1D_pwr(&decmpData, dataSeriesLength, tdps);
+				//decompressDataSeries_double_1D_pwr(&decmpData, dataSeriesLength, tdps);
+				decompressDataSeries_double_1D_pwr_pre_log(&decmpData, dataSeriesLength, tdps);
 			// insert the decompressed data
 			size_t k = 0;
 			for (i = 0; i < dataSeriesLength; i++) {
@@ -1711,7 +1736,8 @@ void getSnapshotData_double_2D(double** data, size_t r1, size_t r2, TightDataPoi
 					decompressDataSeries_double_2D(data, r1, r2, tdps);
 			}
 			else 
-				decompressDataSeries_double_2D_pwr(data, r1, r2, tdps);
+				//decompressDataSeries_double_2D_pwr(data, r1, r2, tdps);
+				decompressDataSeries_double_2D_pwr_pre_log(data, r1, r2, tdps);
 			return;
 		} else {
 			*data = (double*)malloc(sizeof(double)*dataSeriesLength);
@@ -1733,7 +1759,8 @@ void getSnapshotData_double_2D(double** data, size_t r1, size_t r2, TightDataPoi
 			if(errBoundMode < PW_REL)
 				decompressDataSeries_double_2D(&decmpData, r1, r2, tdps);
 			else 
-				decompressDataSeries_double_2D_pwr(&decmpData, r1, r2, tdps);
+				//decompressDataSeries_double_2D_pwr(&decmpData, r1, r2, tdps);
+				decompressDataSeries_double_2D_pwr_pre_log(&decmpData, r1, r2, tdps);
 			// insert the decompressed data
 			size_t k = 0;
 			for (i = 0; i < dataSeriesLength; i++) {
@@ -1773,7 +1800,8 @@ void getSnapshotData_double_3D(double** data, size_t r1, size_t r2, size_t r3, T
 					decompressDataSeries_double_3D(data, r1, r2, r3, tdps);
 			}
 			else 
-				decompressDataSeries_double_3D_pwr(data, r1, r2, r3, tdps);
+				//decompressDataSeries_double_3D_pwr(data, r1, r2, r3, tdps);
+				decompressDataSeries_double_3D_pwr_pre_log(data, r1, r2, r3, tdps);
 			return;
 		} else {
 			*data = (double*)malloc(sizeof(double)*dataSeriesLength);
@@ -1795,7 +1823,8 @@ void getSnapshotData_double_3D(double** data, size_t r1, size_t r2, size_t r3, T
 			if(errBoundMode < PW_REL)
 				decompressDataSeries_double_3D(&decmpData, r1, r2, r3, tdps);
 			else 
-				decompressDataSeries_double_3D_pwr(&decmpData, r1, r2, r3, tdps);			
+				//decompressDataSeries_double_3D_pwr(&decmpData, r1, r2, r3, tdps);			
+				decompressDataSeries_double_3D_pwr_pre_log(&decmpData, r1, r2, r3, tdps);			
 			// insert the decompressed data
 			size_t k = 0;
 			for (i = 0; i < dataSeriesLength; i++) {
@@ -1836,7 +1865,8 @@ void getSnapshotData_double_4D(double** data, size_t r1, size_t r2, size_t r3, s
 			}
 			else 
 			{
-				decompressDataSeries_double_3D_pwr(data, r1*r2, r3, r4, tdps);
+				//decompressDataSeries_double_3D_pwr(data, r1*r2, r3, r4, tdps);
+				decompressDataSeries_double_3D_pwr_pre_log(data, r1*r2, r3, r4, tdps);
 				//ToDO
 				//decompressDataSeries_double_4D_pwr(data, r1, r2, r3, r4, tdps);
 			}					
@@ -1858,7 +1888,8 @@ void getSnapshotData_double_4D(double** data, size_t r1, size_t r2, size_t r3, s
 			if(errBoundMode < PW_REL)
 				decompressDataSeries_double_4D(&decmpData, r1, r2, r3, r4, tdps);
 			else
-				decompressDataSeries_double_3D_pwr(&decmpData, r1*r2, r3, r4, tdps);
+				//decompressDataSeries_double_3D_pwr(&decmpData, r1*r2, r3, r4, tdps);
+				decompressDataSeries_double_3D_pwr_pre_log(&decmpData, r1*r2, r3, r4, tdps);
 				//ToDo
 				//decompressDataSeries_double_4D_pwr(&decmpData, r1, r2, r3, r4, tdps);
 			// insert the decompressed data
@@ -1873,3 +1904,2719 @@ void getSnapshotData_double_4D(double** data, size_t r1, size_t r2, size_t r3, s
 		}
 	}
 }
+
+void decompressDataSeries_double_2D_nonblocked_with_blocked_regression(double** data, size_t r1, size_t r2, unsigned char* comp_data){
+
+	size_t dim0_offset = r2;
+	size_t num_elements = r1 * r2;
+
+	*data = (double*)malloc(sizeof(double)*num_elements);
+
+	unsigned char * comp_data_pos = comp_data;
+
+	size_t block_size = bytesToInt_bigEndian(comp_data_pos);
+	comp_data_pos += sizeof(int);
+	// calculate block dims
+	size_t num_x, num_y;
+	SZ_COMPUTE_3D_NUMBER_OF_BLOCKS(r1, num_x, block_size);
+	SZ_COMPUTE_3D_NUMBER_OF_BLOCKS(r2, num_y, block_size);
+
+	size_t split_index_x, split_index_y;
+	size_t early_blockcount_x, early_blockcount_y;
+	size_t late_blockcount_x, late_blockcount_y;
+	SZ_COMPUTE_BLOCKCOUNT(r1, num_x, split_index_x, early_blockcount_x, late_blockcount_x);
+	SZ_COMPUTE_BLOCKCOUNT(r2, num_y, split_index_y, early_blockcount_y, late_blockcount_y);
+
+	size_t num_blocks = num_x * num_y;
+
+	double realPrecision = bytesToDouble(comp_data_pos);
+	comp_data_pos += sizeof(double);
+	unsigned int intervals = bytesToInt_bigEndian(comp_data_pos);
+	comp_data_pos += sizeof(int);
+
+	updateQuantizationInfo(intervals);
+
+	unsigned int tree_size = bytesToInt_bigEndian(comp_data_pos);
+	comp_data_pos += sizeof(int);
+
+	int stateNum = 2*intervals;
+	HuffmanTree* huffmanTree = createHuffmanTree(stateNum);
+	
+	int nodeCount = bytesToInt_bigEndian(comp_data_pos);
+	
+	node root = reconstruct_HuffTree_from_bytes_anyStates(huffmanTree,comp_data_pos+sizeof(int), nodeCount);
+	comp_data_pos += sizeof(int) + tree_size;
+
+	double mean;
+	unsigned char use_mean;
+	memcpy(&use_mean, comp_data_pos, sizeof(unsigned char));
+	comp_data_pos += sizeof(unsigned char);
+	memcpy(&mean, comp_data_pos, sizeof(double));
+	comp_data_pos += sizeof(double);
+	size_t reg_count = 0;
+
+	unsigned char * indicator;
+	size_t indicator_bitlength = (num_blocks - 1)/8 + 1;
+	convertByteArray2IntArray_fast_1b(num_blocks, comp_data_pos, indicator_bitlength, &indicator);
+	comp_data_pos += indicator_bitlength;
+	for(size_t i=0; i<num_blocks; i++){
+		if(!indicator[i]) reg_count ++;
+	}
+	//printf("reg_count: %ld\n", reg_count);
+
+	int coeff_intvRadius[3];
+	int * coeff_result_type = (int *) malloc(num_blocks*3*sizeof(int));
+	int * coeff_type[3];
+	double precision[3];
+	double * coeff_unpred_data[3];
+	if(reg_count > 0){
+		for(int i=0; i<3; i++){
+			precision[i] = bytesToDouble(comp_data_pos);
+			comp_data_pos += sizeof(double);
+			coeff_intvRadius[i] = bytesToInt_bigEndian(comp_data_pos);
+			comp_data_pos += sizeof(int);
+			unsigned int tree_size = bytesToInt_bigEndian(comp_data_pos);
+			comp_data_pos += sizeof(int);
+			int stateNum = 2*coeff_intvRadius[i]*2;
+			HuffmanTree* huffmanTree = createHuffmanTree(stateNum);	
+			int nodeCount = bytesToInt_bigEndian(comp_data_pos);
+			node root = reconstruct_HuffTree_from_bytes_anyStates(huffmanTree, comp_data_pos+sizeof(int), nodeCount);
+			comp_data_pos += sizeof(int) + tree_size;
+
+			coeff_type[i] = coeff_result_type + i * num_blocks;
+			size_t typeArray_size = bytesToSize(comp_data_pos);
+			decode(comp_data_pos + sizeof(size_t), reg_count, root, coeff_type[i]);
+			comp_data_pos += sizeof(size_t) + typeArray_size;
+			int coeff_unpred_count = bytesToInt_bigEndian(comp_data_pos);
+			comp_data_pos += sizeof(int);
+			coeff_unpred_data[i] = (double *) comp_data_pos;
+			comp_data_pos += coeff_unpred_count * sizeof(double);
+			SZ_ReleaseHuffman(huffmanTree);
+		}
+	}
+	double last_coefficients[3] = {0.0};
+	int coeff_unpred_data_count[3] = {0};
+	int coeff_index = 0;
+	updateQuantizationInfo(intervals);
+
+	size_t total_unpred;
+	memcpy(&total_unpred, comp_data_pos, sizeof(size_t));
+	comp_data_pos += sizeof(size_t);
+	double * unpred_data = (double *) comp_data_pos;
+	comp_data_pos += total_unpred * sizeof(double);
+
+	int * result_type = (int *) malloc(num_elements * sizeof(int));
+	decode(comp_data_pos, num_elements, root, result_type);
+	SZ_ReleaseHuffman(huffmanTree);
+	
+	int intvRadius = exe_params->intvRadius;
+	
+	int * type;
+
+	double * data_pos = *data;
+	size_t offset_x, offset_y;
+	size_t current_blockcount_x, current_blockcount_y;
+	size_t cur_unpred_count;
+
+	unsigned char * indicator_pos = indicator;
+	if(use_mean){
+		type = result_type;
+		for(size_t i=0; i<num_x; i++){
+			for(size_t j=0; j<num_y; j++){
+				offset_x = (i < split_index_x) ? i * early_blockcount_x : i * late_blockcount_x + split_index_x;
+				offset_y = (j < split_index_y) ? j * early_blockcount_y : j * late_blockcount_y + split_index_y;
+				data_pos = *data + offset_x * dim0_offset + offset_y;
+
+				current_blockcount_x = (i < split_index_x) ? early_blockcount_x : late_blockcount_x;
+				current_blockcount_y = (j < split_index_y) ? early_blockcount_y : late_blockcount_y;
+
+				size_t current_block_elements = current_blockcount_x * current_blockcount_y;
+				if(*indicator_pos){
+					// decompress by SZ
+
+					double * block_data_pos = data_pos;
+					double pred;
+					size_t index = 0;
+					int type_;
+					// d11 is current data
+					size_t unpredictable_count = 0;
+					double d00, d01, d10;
+					for(size_t ii=0; ii<current_blockcount_x; ii++){
+						for(size_t jj=0; jj<current_blockcount_y; jj++){
+							type_ = type[index];
+							if(type_ == intvRadius){
+								*block_data_pos = mean;
+							}
+							else if(type_ == 0){
+								*block_data_pos = unpred_data[unpredictable_count ++];
+							}
+							else{
+								d00 = d01 = d10 = 1;
+								if(i == 0 && ii == 0){
+									d00 = d01 = 0;
+								}
+								if(j == 0 && jj == 0){
+									d00 = d10 = 0;
+								}
+								if(d00){
+									d00 = block_data_pos[- dim0_offset - 1];
+								}
+								if(d01){
+									d01 = block_data_pos[- dim0_offset];
+								}
+								if(d10){
+									d10 = block_data_pos[- 1];
+								}
+								if(type_ < intvRadius) type_ += 1;
+								pred = d10 + d01 - d00;
+								*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+							}
+							index ++;
+							block_data_pos ++;
+						}
+						block_data_pos += dim0_offset - current_blockcount_y;
+					}
+					cur_unpred_count = unpredictable_count;
+				}
+				else{
+					// decompress by regression
+					{
+						//restore regression coefficients
+						double pred;
+						int type_;
+						for(int e=0; e<3; e++){
+							type_ = coeff_type[e][coeff_index];
+							if (type_ != 0){
+								pred = last_coefficients[e];
+								last_coefficients[e] = pred + 2 * (type_ - coeff_intvRadius[e]) * precision[e];
+							}
+							else{
+								last_coefficients[e] = coeff_unpred_data[e][coeff_unpred_data_count[e]];
+								coeff_unpred_data_count[e] ++;
+							}
+						}
+						coeff_index ++;
+					}
+					{
+						double * block_data_pos = data_pos;
+						double pred;
+						int type_;
+						size_t index = 0;
+						size_t unpredictable_count = 0;
+						for(size_t ii=0; ii<current_blockcount_x; ii++){
+							for(size_t jj=0; jj<current_blockcount_y; jj++){
+								type_ = type[index];
+								if (type_ != 0){
+									pred = last_coefficients[0] * ii + last_coefficients[1] * jj + last_coefficients[2];
+									*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+								}
+								else{
+									*block_data_pos = unpred_data[unpredictable_count ++];
+								}
+
+								index ++;	
+								block_data_pos ++;
+							}
+							block_data_pos += dim0_offset - current_blockcount_y;
+						}
+						cur_unpred_count = unpredictable_count;
+					}
+				}
+
+				type += current_block_elements;
+				indicator_pos ++;
+				unpred_data += cur_unpred_count;
+			}
+		}
+	}
+	else{
+		type = result_type;
+		for(size_t i=0; i<num_x; i++){
+			for(size_t j=0; j<num_y; j++){
+				offset_x = (i < split_index_x) ? i * early_blockcount_x : i * late_blockcount_x + split_index_x;
+				offset_y = (j < split_index_y) ? j * early_blockcount_y : j * late_blockcount_y + split_index_y;
+				data_pos = *data + offset_x * dim0_offset + offset_y;
+
+				current_blockcount_x = (i < split_index_x) ? early_blockcount_x : late_blockcount_x;
+				current_blockcount_y = (j < split_index_y) ? early_blockcount_y : late_blockcount_y;
+
+				size_t current_block_elements = current_blockcount_x * current_blockcount_y;
+				if(*indicator_pos){
+					// decompress by SZ
+					
+					double * block_data_pos = data_pos;
+					double pred;
+					size_t index = 0;
+					int type_;
+					// d11 is current data
+					size_t unpredictable_count = 0;
+					double d00, d01, d10;
+					for(size_t ii=0; ii<current_blockcount_x; ii++){
+						for(size_t jj=0; jj<current_blockcount_y; jj++){
+							type_ = type[index];
+							if(type_ == 0){
+								*block_data_pos = unpred_data[unpredictable_count ++];
+							}
+							else{
+								d00 = d01 = d10 = 1;
+								if(i == 0 && ii == 0){
+									d00 = d01 = 0;
+								}
+								if(j == 0 && jj == 0){
+									d00 = d10 = 0;
+								}
+								if(d00){
+									d00 = block_data_pos[- dim0_offset - 1];
+								}
+								if(d01){
+									d01 = block_data_pos[- dim0_offset];
+								}
+								if(d10){
+									d10 = block_data_pos[- 1];
+								}
+								pred = d10 + d01 - d00;
+								*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+							}
+							index ++;
+							block_data_pos ++;
+						}
+						block_data_pos += dim0_offset - current_blockcount_y;
+					}
+					cur_unpred_count = unpredictable_count;
+				}
+				else{
+					// decompress by regression
+					{
+						//restore regression coefficients
+						double pred;
+						int type_;
+						for(int e=0; e<3; e++){
+							type_ = coeff_type[e][coeff_index];
+							if (type_ != 0){
+								pred = last_coefficients[e];
+								last_coefficients[e] = pred + 2 * (type_ - coeff_intvRadius[e]) * precision[e];
+							}
+							else{
+								last_coefficients[e] = coeff_unpred_data[e][coeff_unpred_data_count[e]];
+								coeff_unpred_data_count[e] ++;
+							}
+						}
+						coeff_index ++;
+					}
+					{
+						double * block_data_pos = data_pos;
+						double pred;
+						int type_;
+						size_t index = 0;
+						size_t unpredictable_count = 0;
+						for(size_t ii=0; ii<current_blockcount_x; ii++){
+							for(size_t jj=0; jj<current_blockcount_y; jj++){
+								type_ = type[index];
+								if (type_ != 0){
+									pred = last_coefficients[0] * ii + last_coefficients[1] * jj + last_coefficients[2];
+									*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+								}
+								else{
+									*block_data_pos = unpred_data[unpredictable_count ++];
+								}
+								index ++;	
+								block_data_pos ++;
+							}
+							block_data_pos += dim0_offset - current_blockcount_y;
+						}
+						cur_unpred_count = unpredictable_count;
+					}
+				}
+
+				type += current_block_elements;
+				indicator_pos ++;
+				unpred_data += cur_unpred_count;
+			}
+		}
+	}
+	free(coeff_result_type);
+
+	free(indicator);
+	free(result_type);
+}
+
+
+void decompressDataSeries_double_3D_nonblocked_with_blocked_regression(double** data, size_t r1, size_t r2, size_t r3, unsigned char* comp_data){
+
+	size_t dim0_offset = r2 * r3;
+	size_t dim1_offset = r3;
+	size_t num_elements = r1 * r2 * r3;
+
+	*data = (double*)malloc(sizeof(double)*num_elements);
+
+	unsigned char * comp_data_pos = comp_data;
+
+	size_t block_size = bytesToInt_bigEndian(comp_data_pos);
+	comp_data_pos += sizeof(int);
+	// calculate block dims
+	size_t num_x, num_y, num_z;
+	SZ_COMPUTE_3D_NUMBER_OF_BLOCKS(r1, num_x, block_size);
+	SZ_COMPUTE_3D_NUMBER_OF_BLOCKS(r2, num_y, block_size);
+	SZ_COMPUTE_3D_NUMBER_OF_BLOCKS(r3, num_z, block_size);
+
+	size_t split_index_x, split_index_y, split_index_z;
+	size_t early_blockcount_x, early_blockcount_y, early_blockcount_z;
+	size_t late_blockcount_x, late_blockcount_y, late_blockcount_z;
+	SZ_COMPUTE_BLOCKCOUNT(r1, num_x, split_index_x, early_blockcount_x, late_blockcount_x);
+	SZ_COMPUTE_BLOCKCOUNT(r2, num_y, split_index_y, early_blockcount_y, late_blockcount_y);
+	SZ_COMPUTE_BLOCKCOUNT(r3, num_z, split_index_z, early_blockcount_z, late_blockcount_z);
+
+	size_t num_blocks = num_x * num_y * num_z;
+
+	double realPrecision = bytesToDouble(comp_data_pos);
+	comp_data_pos += sizeof(double);
+	unsigned int intervals = bytesToInt_bigEndian(comp_data_pos);
+	comp_data_pos += sizeof(int);
+
+	updateQuantizationInfo(intervals);
+
+	unsigned int tree_size = bytesToInt_bigEndian(comp_data_pos);
+	comp_data_pos += sizeof(int);
+	
+	int stateNum = 2*intervals;
+	HuffmanTree* huffmanTree = createHuffmanTree(stateNum);	
+	
+	int nodeCount = bytesToInt_bigEndian(comp_data_pos);
+	node root = reconstruct_HuffTree_from_bytes_anyStates(huffmanTree,comp_data_pos+4, nodeCount);
+	comp_data_pos += sizeof(int) + tree_size;
+
+	double mean;
+	unsigned char use_mean;
+	memcpy(&use_mean, comp_data_pos, sizeof(unsigned char));
+	comp_data_pos += sizeof(unsigned char);
+	memcpy(&mean, comp_data_pos, sizeof(double));
+	comp_data_pos += sizeof(double);
+	size_t reg_count = 0;
+
+	unsigned char * indicator;
+	size_t indicator_bitlength = (num_blocks - 1)/8 + 1;
+	convertByteArray2IntArray_fast_1b(num_blocks, comp_data_pos, indicator_bitlength, &indicator);
+	comp_data_pos += indicator_bitlength;
+	for(size_t i=0; i<num_blocks; i++){
+		if(!indicator[i]) reg_count ++;
+	}
+
+	int coeff_intvRadius[4];
+	int * coeff_result_type = (int *) malloc(num_blocks*4*sizeof(int));
+	int * coeff_type[4];
+	double precision[4];
+	double * coeff_unpred_data[4];
+	if(reg_count > 0){
+		for(int i=0; i<4; i++){
+			precision[i] = bytesToDouble(comp_data_pos);
+			comp_data_pos += sizeof(double);
+			coeff_intvRadius[i] = bytesToInt_bigEndian(comp_data_pos);
+			comp_data_pos += sizeof(int);
+			unsigned int tree_size = bytesToInt_bigEndian(comp_data_pos);
+			comp_data_pos += sizeof(int);
+			int stateNum = 2*coeff_intvRadius[i]*2;
+			HuffmanTree* huffmanTree = createHuffmanTree(stateNum);	
+			int nodeCount = bytesToInt_bigEndian(comp_data_pos);
+			node root = reconstruct_HuffTree_from_bytes_anyStates(huffmanTree, comp_data_pos+4, nodeCount);
+			comp_data_pos += sizeof(int) + tree_size;
+
+			coeff_type[i] = coeff_result_type + i * num_blocks;
+			size_t typeArray_size = bytesToSize(comp_data_pos);
+			decode(comp_data_pos + sizeof(size_t), reg_count, root, coeff_type[i]);
+			comp_data_pos += sizeof(size_t) + typeArray_size;
+			int coeff_unpred_count = bytesToInt_bigEndian(comp_data_pos);
+			comp_data_pos += sizeof(int);
+			coeff_unpred_data[i] = (double *) comp_data_pos;
+			comp_data_pos += coeff_unpred_count * sizeof(double);
+			SZ_ReleaseHuffman(huffmanTree);
+		}
+	}
+	double last_coefficients[4] = {0.0};
+	int coeff_unpred_data_count[4] = {0};
+	int coeff_index = 0;
+	updateQuantizationInfo(intervals);
+
+	size_t total_unpred;
+	memcpy(&total_unpred, comp_data_pos, sizeof(size_t));
+	comp_data_pos += sizeof(size_t);
+	double * unpred_data = (double *) comp_data_pos;
+	comp_data_pos += total_unpred * sizeof(double);
+
+	int * result_type = (int *) malloc(num_elements * sizeof(int));
+	decode(comp_data_pos, num_elements, root, result_type);
+	SZ_ReleaseHuffman(huffmanTree);
+	
+	int intvRadius = exe_params->intvRadius;
+	
+	int * type;
+	double * data_pos = *data;
+	size_t offset_x, offset_y, offset_z;
+	size_t current_blockcount_x, current_blockcount_y, current_blockcount_z;
+	size_t cur_unpred_count;
+	unsigned char * indicator_pos = indicator;
+	if(use_mean){
+		// type = result_type;
+
+		// for(size_t i=0; i<num_x; i++){
+		// 	for(size_t j=0; j<num_y; j++){
+		// 		for(size_t k=0; k<num_z; k++){
+		// 			offset_x = (i < split_index_x) ? i * early_blockcount_x : i * late_blockcount_x + split_index_x;
+		// 			offset_y = (j < split_index_y) ? j * early_blockcount_y : j * late_blockcount_y + split_index_y;
+		// 			offset_z = (k < split_index_z) ? k * early_blockcount_z : k * late_blockcount_z + split_index_z;
+		// 			data_pos = *data + offset_x * dim0_offset + offset_y * dim1_offset + offset_z;
+
+		// 			current_blockcount_x = (i < split_index_x) ? early_blockcount_x : late_blockcount_x;
+		// 			current_blockcount_y = (j < split_index_y) ? early_blockcount_y : late_blockcount_y;
+		// 			current_blockcount_z = (k < split_index_z) ? early_blockcount_z : late_blockcount_z;
+
+		// 			// type_offset = offset_x * dim0_offset +  offset_y * current_blockcount_x * dim1_offset + offset_z * current_blockcount_x * current_blockcount_y;
+		// 			// type = result_type + type_offset;
+		// 			size_t current_block_elements = current_blockcount_x * current_blockcount_y * current_blockcount_z;
+		// 			// index = i * num_y * num_z + j * num_z + k;
+
+		// 			// printf("i j k: %ld %ld %ld\toffset: %ld %ld %ld\tindicator: %ld\n", i, j, k, offset_x, offset_y, offset_z, indicator[index]);
+		// 			if(*indicator_pos){
+		// 				// decompress by SZ
+		// 				// cur_unpred_count = decompressDataSeries_double_3D_blocked_nonblock_pred(data_pos, r1, r2, r3, current_blockcount_x, current_blockcount_y, current_blockcount_z, i, j, k, realPrecision, type, unpred_data);
+		// 				double * block_data_pos = data_pos;
+		// 				double pred;
+		// 				size_t index = 0;
+		// 				int type_;
+		// 				// d111 is current data
+		// 				size_t unpredictable_count = 0;
+		// 				double d000, d001, d010, d011, d100, d101, d110;
+		// 				for(size_t ii=0; ii<current_blockcount_x; ii++){
+		// 					for(size_t jj=0; jj<current_blockcount_y; jj++){
+		// 						for(size_t kk=0; kk<current_blockcount_z; kk++){
+		// 							type_ = type[index];
+		// 							if(type_ == intvRadius){
+		// 								*block_data_pos = mean;
+		// 							}
+		// 							else if(type_ == 0){
+		// 								*block_data_pos = unpred_data[unpredictable_count ++];
+		// 							}
+		// 							else{
+		// 								d000 = d001 = d010 = d011 = d100 = d101 = d110 = 1;
+		// 								if(i == 0 && ii == 0){
+		// 									d000 = d001 = d010 = d011 = 0;
+		// 								}
+		// 								if(j == 0 && jj == 0){
+		// 									d000 = d001 = d100 = d101 = 0;
+		// 								}
+		// 								if(k == 0 && kk == 0){
+		// 									d000 = d010 = d100 = d110 = 0;
+		// 								}
+		// 								if(d000){
+		// 									d000 = block_data_pos[- dim0_offset - dim1_offset - 1];
+		// 								}
+		// 								if(d001){
+		// 									d001 = block_data_pos[- dim0_offset - dim1_offset];
+		// 								}
+		// 								if(d010){
+		// 									d010 = block_data_pos[- dim0_offset - 1];
+		// 								}
+		// 								if(d011){
+		// 									d011 = block_data_pos[- dim0_offset];
+		// 								}
+		// 								if(d100){
+		// 									d100 = block_data_pos[- dim1_offset - 1];
+		// 								}
+		// 								if(d101){
+		// 									d101 = block_data_pos[- dim1_offset];
+		// 								}
+		// 								if(d110){
+		// 									d110 = block_data_pos[- 1];
+		// 								}
+		// 								if(type_ < intvRadius) type_ += 1;
+		// 								pred = d110 + d101 + d011 - d100 - d010 - d001 + d000;
+		// 								*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+		// 							}
+		// 							index ++;
+		// 							block_data_pos ++;
+		// 						}
+		// 						block_data_pos += dim1_offset - current_blockcount_z;
+		// 					}
+		// 					block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+		// 				}
+		// 				cur_unpred_count = unpredictable_count;
+		// 			}
+		// 			else{
+		// 				// decompress by regression
+		// 				{
+		// 					//restore regression coefficients
+		// 					double pred;
+		// 					int type_;
+		// 					for(int e=0; e<4; e++){
+		// 						// if(i == 0 && j == 0 && k == 19){
+		// 						// 	printf("~\n");
+		// 						// }
+		// 						type_ = coeff_type[e][coeff_index];
+		// 						if (type_ != 0){
+		// 							pred = last_coefficients[e];
+		// 							last_coefficients[e] = pred + 2 * (type_ - coeff_intvRadius[e]) * precision[e];
+		// 						}
+		// 						else{
+		// 							last_coefficients[e] = coeff_unpred_data[e][coeff_unpred_data_count[e]];
+		// 							coeff_unpred_data_count[e] ++;
+		// 						}
+		// 						if(fabs(last_coefficients[e]) > 10000){
+		// 							printf("%d %d %d-%d: pred %.4f type %d precision %.4g last_coefficients %.4g\n", i, j, k, e, pred, type_, precision[e], last_coefficients[e]);
+		// 							exit(0);
+		// 						}
+		// 					}
+		// 					coeff_index ++;
+		// 				}
+		// 				{
+		// 					double * block_data_pos = data_pos;
+		// 					double pred;
+		// 					int type_;
+		// 					size_t index = 0;
+		// 					size_t unpredictable_count = 0;
+		// 					for(size_t ii=0; ii<current_blockcount_x; ii++){
+		// 						for(size_t jj=0; jj<current_blockcount_y; jj++){
+		// 							for(size_t kk=0; kk<current_blockcount_z; kk++){
+		// 								if(block_data_pos - (*data) == 19470788){
+		// 									printf("dec stop\n");
+		// 								}
+
+		// 								type_ = type[index];
+		// 								if (type_ != 0){
+		// 									pred = last_coefficients[0] * ii + last_coefficients[1] * jj + last_coefficients[2] * kk + last_coefficients[3];
+		// 									*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+		// 								}
+		// 								else{
+		// 									*block_data_pos = unpred_data[unpredictable_count ++];
+		// 								}
+		// 								index ++;	
+		// 								block_data_pos ++;
+		// 							}
+		// 							block_data_pos += dim1_offset - current_blockcount_z;
+		// 						}
+		// 						block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+		// 					}
+		// 					cur_unpred_count = unpredictable_count;
+		// 				}
+		// 			}
+
+		// 			type += current_block_elements;
+		// 			indicator_pos ++;
+		// 			unpred_data += cur_unpred_count;
+		// 			// decomp_unpred += cur_unpred_count;
+		// 			// printf("block comp done, data_offset from %ld to %ld: diff %ld\n", *data, data_pos, data_pos - *data);
+		// 			// fflush(stdout);
+		// 		}
+		// 	}
+		// }
+
+		type = result_type;
+		// i == 0
+		{
+			// j == 0
+			{
+				// k == 0
+				{
+					data_pos = *data;
+
+					current_blockcount_x = early_blockcount_x;
+					current_blockcount_y = early_blockcount_y;
+					current_blockcount_z = early_blockcount_z;
+					size_t current_block_elements = current_blockcount_x * current_blockcount_y * current_blockcount_z;
+					if(*indicator_pos){
+						// decompress by SZ
+						double * block_data_pos = data_pos;
+						double pred;
+						size_t index = 0;
+						int type_;
+						size_t unpredictable_count = 0;
+						// ii == 0
+						{
+							// jj == 0
+							{
+								{
+									// kk == 0
+									type_ = type[index];
+									if(type_ == intvRadius){
+										*block_data_pos = mean;
+									}
+									else if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										if(type_ < intvRadius) type_ += 1;
+										pred = 0;
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								for(size_t kk=1; kk<current_blockcount_z; kk++){
+									type_ = type[index];
+									if(type_ == intvRadius){
+										*block_data_pos = mean;
+									}
+									else if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										if(type_ < intvRadius) type_ += 1;
+										pred = block_data_pos[- 1];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z;
+							}
+							for(size_t jj=1; jj<current_blockcount_y; jj++){
+								{
+									// kk == 0
+									type_ = type[index];
+									if(type_ == intvRadius){
+										*block_data_pos = mean;
+									}
+									else if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										if(type_ < intvRadius) type_ += 1;
+										pred = block_data_pos[- dim1_offset];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								for(size_t kk=1; kk<current_blockcount_z; kk++){
+									type_ = type[index];
+									if(type_ == intvRadius){
+										*block_data_pos = mean;
+									}
+									else if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										if(type_ < intvRadius) type_ += 1;
+										pred = block_data_pos[- 1] + block_data_pos[- dim1_offset] - block_data_pos[- dim1_offset - 1];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z;
+							}
+							block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;						
+						}
+						for(size_t ii=1; ii<current_blockcount_x; ii++){
+							// jj == 0
+							{
+								{
+									// kk == 0
+									type_ = type[index];
+									if(type_ == intvRadius){
+										*block_data_pos = mean;
+									}
+									else if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										if(type_ < intvRadius) type_ += 1;
+										pred = block_data_pos[- dim0_offset];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								for(size_t kk=1; kk<current_blockcount_z; kk++){
+									type_ = type[index];
+									if(type_ == intvRadius){
+										*block_data_pos = mean;
+									}
+									else if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										if(type_ < intvRadius) type_ += 1;
+										pred = block_data_pos[- 1] + block_data_pos[- dim0_offset] - block_data_pos[- dim0_offset - 1];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z;
+							}
+							for(size_t jj=1; jj<current_blockcount_y; jj++){
+								{
+									// kk == 0
+									type_ = type[index];
+									if(type_ == intvRadius){
+										*block_data_pos = mean;
+									}
+									else if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										if(type_ < intvRadius) type_ += 1;
+										pred = block_data_pos[- dim1_offset] + block_data_pos[- dim0_offset] - block_data_pos[- dim0_offset - dim1_offset];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								for(size_t kk=1; kk<current_blockcount_z; kk++){
+									type_ = type[index];
+									if(type_ == intvRadius){
+										*block_data_pos = mean;
+									}
+									else if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										if(type_ < intvRadius) type_ += 1;
+										pred = block_data_pos[- 1] + block_data_pos[- dim1_offset] + block_data_pos[- dim0_offset] - block_data_pos[- dim1_offset - 1] - block_data_pos[- dim0_offset - 1] - block_data_pos[- dim0_offset - dim1_offset] + block_data_pos[- dim0_offset - dim1_offset - 1];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z;
+							}
+							block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+						}
+						cur_unpred_count = unpredictable_count;
+					}
+					else{
+						// decompress by regression
+						{
+							//restore regression coefficients
+							double pred;
+							int type_;
+							for(int e=0; e<4; e++){
+								type_ = coeff_type[e][coeff_index];
+								if (type_ != 0){
+									pred = last_coefficients[e];
+									last_coefficients[e] = pred + 2 * (type_ - coeff_intvRadius[e]) * precision[e];
+								}
+								else{
+									last_coefficients[e] = coeff_unpred_data[e][coeff_unpred_data_count[e]];
+									coeff_unpred_data_count[e] ++;
+								}
+							}
+							coeff_index ++;
+						}
+						{
+							double * block_data_pos = data_pos;
+							double pred;
+							int type_;
+							size_t index = 0;
+							size_t unpredictable_count = 0;
+							for(size_t ii=0; ii<current_blockcount_x; ii++){
+								for(size_t jj=0; jj<current_blockcount_y; jj++){
+									for(size_t kk=0; kk<current_blockcount_z; kk++){
+										type_ = type[index];
+										if (type_ != 0){
+											pred = last_coefficients[0] * ii + last_coefficients[1] * jj + last_coefficients[2] * kk + last_coefficients[3];
+											*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+										}
+										else{
+											*block_data_pos = unpred_data[unpredictable_count ++];
+										}
+										index ++;	
+										block_data_pos ++;
+									}
+									block_data_pos += dim1_offset - current_blockcount_z;
+								}
+								block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+							}
+							cur_unpred_count = unpredictable_count;
+						}
+					}
+					indicator_pos ++;
+					type += current_block_elements;
+					unpred_data += cur_unpred_count;
+				} // end k == 0
+				// i == 0 j == 0 k != 0
+				for(size_t k=1; k<num_z; k++){
+					offset_z = (k < split_index_z) ? k * early_blockcount_z : k * late_blockcount_z + split_index_z;
+					data_pos = *data + offset_z;
+
+					current_blockcount_x = early_blockcount_x;
+					current_blockcount_y = early_blockcount_y;
+					current_blockcount_z = (k < split_index_z) ? early_blockcount_z : late_blockcount_z;
+
+					size_t current_block_elements = current_blockcount_x * current_blockcount_y * current_blockcount_z;
+					if(*indicator_pos){
+						// decompress by SZ
+						double * block_data_pos = data_pos;
+						double pred;
+						size_t index = 0;
+						int type_;
+						size_t unpredictable_count = 0;
+						// ii == 0
+						{
+							// jj == 0
+							{
+								for(size_t kk=0; kk<current_blockcount_z; kk++){
+									type_ = type[index];
+									if(type_ == intvRadius){
+										*block_data_pos = mean;
+									}
+									else if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										if(type_ < intvRadius) type_ += 1;
+										pred = block_data_pos[- 1];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z;
+							}
+							for(size_t jj=1; jj<current_blockcount_y; jj++){
+								for(size_t kk=0; kk<current_blockcount_z; kk++){
+									type_ = type[index];
+									if(type_ == intvRadius){
+										*block_data_pos = mean;
+									}
+									else if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										if(type_ < intvRadius) type_ += 1;
+										pred = block_data_pos[- 1] + block_data_pos[- dim1_offset] - block_data_pos[- dim1_offset - 1];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z;
+							}
+							block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+						}
+						for(size_t ii=1; ii<current_blockcount_x; ii++){
+							// jj == 0
+							{
+								for(size_t kk=0; kk<current_blockcount_z; kk++){
+									type_ = type[index];
+									if(type_ == intvRadius){
+										*block_data_pos = mean;
+									}
+									else if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										if(type_ < intvRadius) type_ += 1;
+										pred = block_data_pos[- 1] + block_data_pos[- dim0_offset] - block_data_pos[- dim0_offset - 1];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z;
+							}
+							for(size_t jj=1; jj<current_blockcount_y; jj++){
+								for(size_t kk=0; kk<current_blockcount_z; kk++){
+									type_ = type[index];
+									if(type_ == intvRadius){
+										*block_data_pos = mean;
+									}
+									else if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										if(type_ < intvRadius) type_ += 1;
+										pred = block_data_pos[- 1] + block_data_pos[- dim1_offset] + block_data_pos[- dim0_offset] - block_data_pos[- dim1_offset - 1] - block_data_pos[- dim0_offset - 1] - block_data_pos[- dim0_offset - dim1_offset] + block_data_pos[- dim0_offset - dim1_offset - 1];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z;
+							}
+							block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+						}
+						cur_unpred_count = unpredictable_count;
+					}
+					else{
+						// decompress by regression
+						{
+							//restore regression coefficients
+							double pred;
+							int type_;
+							for(int e=0; e<4; e++){
+								type_ = coeff_type[e][coeff_index];
+								if (type_ != 0){
+									pred = last_coefficients[e];
+									last_coefficients[e] = pred + 2 * (type_ - coeff_intvRadius[e]) * precision[e];
+								}
+								else{
+									last_coefficients[e] = coeff_unpred_data[e][coeff_unpred_data_count[e]];
+									coeff_unpred_data_count[e] ++;
+								}
+							}
+							coeff_index ++;
+						}
+						{
+							double * block_data_pos = data_pos;
+							double pred;
+							int type_;
+							size_t index = 0;
+							size_t unpredictable_count = 0;
+							for(size_t ii=0; ii<current_blockcount_x; ii++){
+								for(size_t jj=0; jj<current_blockcount_y; jj++){
+									for(size_t kk=0; kk<current_blockcount_z; kk++){
+										type_ = type[index];
+										if (type_ != 0){
+											pred = last_coefficients[0] * ii + last_coefficients[1] * jj + last_coefficients[2] * kk + last_coefficients[3];
+											*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+										}
+										else{
+											*block_data_pos = unpred_data[unpredictable_count ++];
+										}
+										index ++;	
+										block_data_pos ++;
+									}
+									block_data_pos += dim1_offset - current_blockcount_z;
+								}
+								block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+							}
+							cur_unpred_count = unpredictable_count;
+						}
+					}
+					indicator_pos ++;
+					type += current_block_elements;
+					unpred_data += cur_unpred_count;
+				}
+			}// end j==0
+			for(size_t j=1; j<num_y; j++){
+				// k == 0
+				{
+					offset_y = (j < split_index_y) ? j * early_blockcount_y : j * late_blockcount_y + split_index_y;
+					data_pos = *data + offset_y * dim1_offset;
+
+					current_blockcount_x = early_blockcount_x;
+					current_blockcount_y = (j < split_index_y) ? early_blockcount_y : late_blockcount_y;
+					current_blockcount_z = early_blockcount_z;
+					size_t current_block_elements = current_blockcount_x * current_blockcount_y * current_blockcount_z;
+					if(*indicator_pos){
+						// decompress by SZ
+						double * block_data_pos = data_pos;
+						double pred;
+						size_t index = 0;
+						int type_;
+						size_t unpredictable_count = 0;
+						// ii == 0
+						{
+							for(size_t jj=0; jj<current_blockcount_y; jj++){
+								{
+									// kk == 0
+									type_ = type[index];
+									if(type_ == intvRadius){
+										*block_data_pos = mean;
+									}
+									else if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										if(type_ < intvRadius) type_ += 1;
+										pred = block_data_pos[- dim1_offset];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								for(size_t kk=1; kk<current_blockcount_z; kk++){
+									type_ = type[index];
+									if(type_ == intvRadius){
+										*block_data_pos = mean;
+									}
+									else if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										if(type_ < intvRadius) type_ += 1;
+										pred = block_data_pos[- 1] + block_data_pos[- dim1_offset] - block_data_pos[- dim1_offset - 1];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z;
+							}
+							block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+						}
+						for(size_t ii=1; ii<current_blockcount_x; ii++){
+							for(size_t jj=0; jj<current_blockcount_y; jj++){
+								{
+									// kk == 0
+									type_ = type[index];
+									if(type_ == intvRadius){
+										*block_data_pos = mean;
+									}
+									else if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										if(type_ < intvRadius) type_ += 1;
+										pred = block_data_pos[- dim1_offset] + block_data_pos[- dim0_offset] - block_data_pos[- dim0_offset - dim1_offset];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								for(size_t kk=1; kk<current_blockcount_z; kk++){
+									type_ = type[index];
+									if(type_ == intvRadius){
+										*block_data_pos = mean;
+									}
+									else if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										if(type_ < intvRadius) type_ += 1;
+										pred = block_data_pos[- 1] + block_data_pos[- dim1_offset] + block_data_pos[- dim0_offset] - block_data_pos[- dim1_offset - 1] - block_data_pos[- dim0_offset - 1] - block_data_pos[- dim0_offset - dim1_offset] + block_data_pos[- dim0_offset - dim1_offset - 1];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z;
+							}
+							block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+						}
+						cur_unpred_count = unpredictable_count;
+					}
+					else{
+						// decompress by regression
+						{
+							//restore regression coefficients
+							double pred;
+							int type_;
+							for(int e=0; e<4; e++){
+								type_ = coeff_type[e][coeff_index];
+								if (type_ != 0){
+									pred = last_coefficients[e];
+									last_coefficients[e] = pred + 2 * (type_ - coeff_intvRadius[e]) * precision[e];
+								}
+								else{
+									last_coefficients[e] = coeff_unpred_data[e][coeff_unpred_data_count[e]];
+									coeff_unpred_data_count[e] ++;
+								}
+							}
+							coeff_index ++;
+						}
+						{
+							double * block_data_pos = data_pos;
+							double pred;
+							int type_;
+							size_t index = 0;
+							size_t unpredictable_count = 0;
+							for(size_t ii=0; ii<current_blockcount_x; ii++){
+								for(size_t jj=0; jj<current_blockcount_y; jj++){
+									for(size_t kk=0; kk<current_blockcount_z; kk++){
+										type_ = type[index];
+										if (type_ != 0){
+											pred = last_coefficients[0] * ii + last_coefficients[1] * jj + last_coefficients[2] * kk + last_coefficients[3];
+											*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+										}
+										else{
+											*block_data_pos = unpred_data[unpredictable_count ++];
+										}
+										index ++;	
+										block_data_pos ++;
+									}
+									block_data_pos += dim1_offset - current_blockcount_z;
+								}
+								block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+							}
+							cur_unpred_count = unpredictable_count;
+						}
+					}
+					indicator_pos ++;
+					type += current_block_elements;
+					unpred_data += cur_unpred_count;
+				} // end k == 0
+				for(size_t k=1; k<num_z; k++){
+					offset_y = (j < split_index_y) ? j * early_blockcount_y : j * late_blockcount_y + split_index_y;
+					offset_z = (k < split_index_z) ? k * early_blockcount_z : k * late_blockcount_z + split_index_z;
+					data_pos = *data + offset_y * dim1_offset + offset_z;
+
+					current_blockcount_x = early_blockcount_x;
+					current_blockcount_y = (j < split_index_y) ? early_blockcount_y : late_blockcount_y;
+					current_blockcount_z = (k < split_index_z) ? early_blockcount_z : late_blockcount_z;
+
+					size_t current_block_elements = current_blockcount_x * current_blockcount_y * current_blockcount_z;
+					if(*indicator_pos){
+						// decompress by SZ
+						double * block_data_pos = data_pos;
+						double pred;
+						size_t index = 0;
+						int type_;
+						size_t unpredictable_count = 0;
+						// ii == 0
+						{
+							for(size_t jj=0; jj<current_blockcount_y; jj++){
+								for(size_t kk=0; kk<current_blockcount_z; kk++){
+									type_ = type[index];
+									if(type_ == intvRadius){
+										*block_data_pos = mean;
+									}
+									else if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										if(type_ < intvRadius) type_ += 1;
+										pred = block_data_pos[- 1] + block_data_pos[- dim1_offset] - block_data_pos[- dim1_offset - 1];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z;
+							}
+							block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+						}
+						for(size_t ii=1; ii<current_blockcount_x; ii++){
+							for(size_t jj=0; jj<current_blockcount_y; jj++){
+								for(size_t kk=0; kk<current_blockcount_z; kk++){
+									type_ = type[index];
+									if(type_ == intvRadius){
+										*block_data_pos = mean;
+									}
+									else if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										if(type_ < intvRadius) type_ += 1;
+										pred = block_data_pos[- 1] + block_data_pos[- dim1_offset] + block_data_pos[- dim0_offset] - block_data_pos[- dim1_offset - 1] - block_data_pos[- dim0_offset - 1] - block_data_pos[- dim0_offset - dim1_offset] + block_data_pos[- dim0_offset - dim1_offset - 1];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z;
+							}
+							block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+						}
+						cur_unpred_count = unpredictable_count;
+					}
+					else{
+						// decompress by regression
+						{
+							//restore regression coefficients
+							double pred;
+							int type_;
+							for(int e=0; e<4; e++){
+								type_ = coeff_type[e][coeff_index];
+								if (type_ != 0){
+									pred = last_coefficients[e];
+									last_coefficients[e] = pred + 2 * (type_ - coeff_intvRadius[e]) * precision[e];
+								}
+								else{
+									last_coefficients[e] = coeff_unpred_data[e][coeff_unpred_data_count[e]];
+									coeff_unpred_data_count[e] ++;
+								}
+							}
+							coeff_index ++;
+						}
+						{
+							double * block_data_pos = data_pos;
+							double pred;
+							int type_;
+							size_t index = 0;
+							size_t unpredictable_count = 0;
+							for(size_t ii=0; ii<current_blockcount_x; ii++){
+								for(size_t jj=0; jj<current_blockcount_y; jj++){
+									for(size_t kk=0; kk<current_blockcount_z; kk++){
+										type_ = type[index];
+										if (type_ != 0){
+											pred = last_coefficients[0] * ii + last_coefficients[1] * jj + last_coefficients[2] * kk + last_coefficients[3];
+											*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+										}
+										else{
+											*block_data_pos = unpred_data[unpredictable_count ++];
+										}
+										index ++;	
+										block_data_pos ++;
+									}
+									block_data_pos += dim1_offset - current_blockcount_z;
+								}
+								block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+							}
+							cur_unpred_count = unpredictable_count;
+						}
+					}
+					indicator_pos ++;
+					type += current_block_elements;
+					unpred_data += cur_unpred_count;
+				}
+			}
+		} // end i==0
+		for(size_t i=1; i<num_x; i++){
+			// j == 0
+			{
+				// k == 0
+				{
+					offset_x = (i < split_index_x) ? i * early_blockcount_x : i * late_blockcount_x + split_index_x;
+					data_pos = *data + offset_x * dim0_offset;
+
+					current_blockcount_x = (i < split_index_x) ? early_blockcount_x : late_blockcount_x;
+					current_blockcount_y = early_blockcount_y;
+					current_blockcount_z = early_blockcount_z;
+					size_t current_block_elements = current_blockcount_x * current_blockcount_y * current_blockcount_z;
+					if(*indicator_pos){
+						// decompress by SZ
+						double * block_data_pos = data_pos;
+						double pred;
+						size_t index = 0;
+						int type_;
+						size_t unpredictable_count = 0;
+						for(size_t ii=0; ii<current_blockcount_x; ii++){
+							// jj == 0
+							{
+								{
+									// kk == 0
+									type_ = type[index];
+									if(type_ == intvRadius){
+										*block_data_pos = mean;
+									}
+									else if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										if(type_ < intvRadius) type_ += 1;
+										pred = block_data_pos[- dim0_offset];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								for(size_t kk=1; kk<current_blockcount_z; kk++){
+									type_ = type[index];
+									if(type_ == intvRadius){
+										*block_data_pos = mean;
+									}
+									else if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										if(type_ < intvRadius) type_ += 1;
+										pred = block_data_pos[- 1] + block_data_pos[- dim0_offset] - block_data_pos[- dim0_offset - 1];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z;
+							}
+							for(size_t jj=1; jj<current_blockcount_y; jj++){
+								{
+									// kk == 0
+									type_ = type[index];
+									if(type_ == intvRadius){
+										*block_data_pos = mean;
+									}
+									else if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										if(type_ < intvRadius) type_ += 1;
+										pred = block_data_pos[- dim1_offset] + block_data_pos[- dim0_offset] - block_data_pos[- dim0_offset - dim1_offset];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								for(size_t kk=1; kk<current_blockcount_z; kk++){
+									type_ = type[index];
+									if(type_ == intvRadius){
+										*block_data_pos = mean;
+									}
+									else if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										if(type_ < intvRadius) type_ += 1;
+										pred = block_data_pos[- 1] + block_data_pos[- dim1_offset] + block_data_pos[- dim0_offset] - block_data_pos[- dim1_offset - 1] - block_data_pos[- dim0_offset - 1] - block_data_pos[- dim0_offset - dim1_offset] + block_data_pos[- dim0_offset - dim1_offset - 1];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z;
+							}
+							block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+						}
+						cur_unpred_count = unpredictable_count;
+					}
+					else{
+						// decompress by regression
+						{
+							//restore regression coefficients
+							double pred;
+							int type_;
+							for(int e=0; e<4; e++){
+								type_ = coeff_type[e][coeff_index];
+								if (type_ != 0){
+									pred = last_coefficients[e];
+									last_coefficients[e] = pred + 2 * (type_ - coeff_intvRadius[e]) * precision[e];
+								}
+								else{
+									last_coefficients[e] = coeff_unpred_data[e][coeff_unpred_data_count[e]];
+									coeff_unpred_data_count[e] ++;
+								}
+							}
+							coeff_index ++;
+						}
+						{
+							double * block_data_pos = data_pos;
+							double pred;
+							int type_;
+							size_t index = 0;
+							size_t unpredictable_count = 0;
+							for(size_t ii=0; ii<current_blockcount_x; ii++){
+								for(size_t jj=0; jj<current_blockcount_y; jj++){
+									for(size_t kk=0; kk<current_blockcount_z; kk++){
+										type_ = type[index];
+										if (type_ != 0){
+											pred = last_coefficients[0] * ii + last_coefficients[1] * jj + last_coefficients[2] * kk + last_coefficients[3];
+											*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+										}
+										else{
+											*block_data_pos = unpred_data[unpredictable_count ++];
+										}
+										index ++;	
+										block_data_pos ++;
+									}
+									block_data_pos += dim1_offset - current_blockcount_z;
+								}
+								block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+							}
+							cur_unpred_count = unpredictable_count;
+						}
+					}
+					indicator_pos ++;
+					type += current_block_elements;
+					unpred_data += cur_unpred_count;
+				} // end k == 0
+				for(size_t k=1; k<num_z; k++){
+					offset_x = (i < split_index_x) ? i * early_blockcount_x : i * late_blockcount_x + split_index_x;
+					offset_z = (k < split_index_z) ? k * early_blockcount_z : k * late_blockcount_z + split_index_z;
+					data_pos = *data + offset_x * dim0_offset + offset_z;
+
+					current_blockcount_x = (i < split_index_x) ? early_blockcount_x : late_blockcount_x;
+					current_blockcount_y = early_blockcount_y;
+					current_blockcount_z = (k < split_index_z) ? early_blockcount_z : late_blockcount_z;
+					size_t current_block_elements = current_blockcount_x * current_blockcount_y * current_blockcount_z;
+					if(*indicator_pos){
+						// decompress by SZ
+						double * block_data_pos = data_pos;
+						double pred;
+						size_t index = 0;
+						int type_;
+						size_t unpredictable_count = 0;
+						for(size_t ii=0; ii<current_blockcount_x; ii++){
+							// jj == 0
+							{
+								for(size_t kk=0; kk<current_blockcount_z; kk++){
+									type_ = type[index];
+									if(type_ == intvRadius){
+										*block_data_pos = mean;
+									}
+									else if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										if(type_ < intvRadius) type_ += 1;
+										pred = block_data_pos[- 1] + block_data_pos[- dim0_offset] - block_data_pos[- dim0_offset - 1];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z;
+							}
+							for(size_t jj=1; jj<current_blockcount_y; jj++){
+								for(size_t kk=0; kk<current_blockcount_z; kk++){
+									type_ = type[index];
+									if(type_ == intvRadius){
+										*block_data_pos = mean;
+									}
+									else if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										if(type_ < intvRadius) type_ += 1;
+										pred = block_data_pos[- 1] + block_data_pos[- dim1_offset] + block_data_pos[- dim0_offset] - block_data_pos[- dim1_offset - 1] - block_data_pos[- dim0_offset - 1] - block_data_pos[- dim0_offset - dim1_offset] + block_data_pos[- dim0_offset - dim1_offset - 1];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z;
+							}
+							block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+						}
+						cur_unpred_count = unpredictable_count;
+					}
+					else{
+						// decompress by regression
+						{
+							//restore regression coefficients
+							double pred;
+							int type_;
+							for(int e=0; e<4; e++){
+								type_ = coeff_type[e][coeff_index];
+								if (type_ != 0){
+									pred = last_coefficients[e];
+									last_coefficients[e] = pred + 2 * (type_ - coeff_intvRadius[e]) * precision[e];
+								}
+								else{
+									last_coefficients[e] = coeff_unpred_data[e][coeff_unpred_data_count[e]];
+									coeff_unpred_data_count[e] ++;
+								}
+							}
+							coeff_index ++;
+						}
+						{
+							double * block_data_pos = data_pos;
+							double pred;
+							int type_;
+							size_t index = 0;
+							size_t unpredictable_count = 0;
+							for(size_t ii=0; ii<current_blockcount_x; ii++){
+								for(size_t jj=0; jj<current_blockcount_y; jj++){
+									for(size_t kk=0; kk<current_blockcount_z; kk++){
+										type_ = type[index];
+										if (type_ != 0){
+											pred = last_coefficients[0] * ii + last_coefficients[1] * jj + last_coefficients[2] * kk + last_coefficients[3];
+											*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+										}
+										else{
+											*block_data_pos = unpred_data[unpredictable_count ++];
+										}
+										index ++;	
+										block_data_pos ++;
+									}
+									block_data_pos += dim1_offset - current_blockcount_z;
+								}
+								block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+							}
+							cur_unpred_count = unpredictable_count;
+						}
+					}
+					indicator_pos ++;
+					type += current_block_elements;
+					unpred_data += cur_unpred_count;
+				}
+			}// end j = 0
+			for(size_t j=1; j<num_y; j++){
+				// k == 0
+				{
+					offset_x = (i < split_index_x) ? i * early_blockcount_x : i * late_blockcount_x + split_index_x;
+					offset_y = (j < split_index_y) ? j * early_blockcount_y : j * late_blockcount_y + split_index_y;
+					data_pos = *data + offset_x * dim0_offset + offset_y * dim1_offset;
+
+					current_blockcount_x = (i < split_index_x) ? early_blockcount_x : late_blockcount_x;
+					current_blockcount_y = (j < split_index_y) ? early_blockcount_y : late_blockcount_y;
+					current_blockcount_z = early_blockcount_z;
+					size_t current_block_elements = current_blockcount_x * current_blockcount_y * current_blockcount_z;
+					if(*indicator_pos){
+						// decompress by SZ
+						double * block_data_pos = data_pos;
+						double pred;
+						size_t index = 0;
+						int type_;
+						size_t unpredictable_count = 0;
+						for(size_t ii=0; ii<current_blockcount_x; ii++){
+							for(size_t jj=0; jj<current_blockcount_y; jj++){
+								{
+									// kk == 0
+									type_ = type[index];
+									if(type_ == intvRadius){
+										*block_data_pos = mean;
+									}
+									else if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										if(type_ < intvRadius) type_ += 1;
+										pred = block_data_pos[- dim1_offset] + block_data_pos[- dim0_offset] - block_data_pos[- dim0_offset - dim1_offset];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								for(size_t kk=1; kk<current_blockcount_z; kk++){
+									type_ = type[index];
+									if(type_ == intvRadius){
+										*block_data_pos = mean;
+									}
+									else if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										if(type_ < intvRadius) type_ += 1;
+										pred = block_data_pos[- 1] + block_data_pos[- dim1_offset] + block_data_pos[- dim0_offset] - block_data_pos[- dim1_offset - 1] - block_data_pos[- dim0_offset - 1] - block_data_pos[- dim0_offset - dim1_offset] + block_data_pos[- dim0_offset - dim1_offset - 1];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z;
+							}
+							block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+						}
+						cur_unpred_count = unpredictable_count;
+					}
+					else{
+						// decompress by regression
+						{
+							//restore regression coefficients
+							double pred;
+							int type_;
+							for(int e=0; e<4; e++){
+								type_ = coeff_type[e][coeff_index];
+								if (type_ != 0){
+									pred = last_coefficients[e];
+									last_coefficients[e] = pred + 2 * (type_ - coeff_intvRadius[e]) * precision[e];
+								}
+								else{
+									last_coefficients[e] = coeff_unpred_data[e][coeff_unpred_data_count[e]];
+									coeff_unpred_data_count[e] ++;
+								}
+							}
+							coeff_index ++;
+						}
+						{
+							double * block_data_pos = data_pos;
+							double pred;
+							int type_;
+							size_t index = 0;
+							size_t unpredictable_count = 0;
+							for(size_t ii=0; ii<current_blockcount_x; ii++){
+								for(size_t jj=0; jj<current_blockcount_y; jj++){
+									for(size_t kk=0; kk<current_blockcount_z; kk++){
+										type_ = type[index];
+										if (type_ != 0){
+											pred = last_coefficients[0] * ii + last_coefficients[1] * jj + last_coefficients[2] * kk + last_coefficients[3];
+											*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+										}
+										else{
+											*block_data_pos = unpred_data[unpredictable_count ++];
+										}
+										index ++;	
+										block_data_pos ++;
+									}
+									block_data_pos += dim1_offset - current_blockcount_z;
+								}
+								block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+							}
+							cur_unpred_count = unpredictable_count;
+						}
+					}
+					indicator_pos ++;
+					type += current_block_elements;
+					unpred_data += cur_unpred_count;
+				} // end k == 0
+				for(size_t k=1; k<num_z; k++){
+					offset_x = (i < split_index_x) ? i * early_blockcount_x : i * late_blockcount_x + split_index_x;
+					offset_y = (j < split_index_y) ? j * early_blockcount_y : j * late_blockcount_y + split_index_y;
+					offset_z = (k < split_index_z) ? k * early_blockcount_z : k * late_blockcount_z + split_index_z;
+					data_pos = *data + offset_x * dim0_offset + offset_y * dim1_offset + offset_z;
+
+					current_blockcount_x = (i < split_index_x) ? early_blockcount_x : late_blockcount_x;
+					current_blockcount_y = (j < split_index_y) ? early_blockcount_y : late_blockcount_y;
+					current_blockcount_z = (k < split_index_z) ? early_blockcount_z : late_blockcount_z;
+
+					size_t current_block_elements = current_blockcount_x * current_blockcount_y * current_blockcount_z;
+					if(*indicator_pos){
+						// decompress by SZ
+						double * block_data_pos = data_pos;
+						double pred;
+						size_t index = 0;
+						int type_;
+						size_t unpredictable_count = 0;
+						for(size_t ii=0; ii<current_blockcount_x; ii++){
+							for(size_t jj=0; jj<current_blockcount_y; jj++){
+								for(size_t kk=0; kk<current_blockcount_z; kk++){
+									type_ = type[index];
+									if(type_ == intvRadius){
+										*block_data_pos = mean;
+									}
+									else if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										if(type_ < intvRadius) type_ += 1;
+										pred = block_data_pos[- 1] + block_data_pos[- dim1_offset] + block_data_pos[- dim0_offset] - block_data_pos[- dim1_offset - 1] - block_data_pos[- dim0_offset - 1] - block_data_pos[- dim0_offset - dim1_offset] + block_data_pos[- dim0_offset - dim1_offset - 1];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z;
+							}
+							block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+						}
+						cur_unpred_count = unpredictable_count;
+					}
+					else{
+						// decompress by regression
+						{
+							//restore regression coefficients
+							double pred;
+							int type_;
+							for(int e=0; e<4; e++){
+								type_ = coeff_type[e][coeff_index];
+								if (type_ != 0){
+									pred = last_coefficients[e];
+									last_coefficients[e] = pred + 2 * (type_ - coeff_intvRadius[e]) * precision[e];
+								}
+								else{
+									last_coefficients[e] = coeff_unpred_data[e][coeff_unpred_data_count[e]];
+									coeff_unpred_data_count[e] ++;
+								}
+							}
+							coeff_index ++;
+						}
+						{
+							double * block_data_pos = data_pos;
+							double pred;
+							int type_;
+							size_t index = 0;
+							size_t unpredictable_count = 0;
+							for(size_t ii=0; ii<current_blockcount_x; ii++){
+								for(size_t jj=0; jj<current_blockcount_y; jj++){
+									for(size_t kk=0; kk<current_blockcount_z; kk++){
+										type_ = type[index];
+										if (type_ != 0){
+											pred = last_coefficients[0] * ii + last_coefficients[1] * jj + last_coefficients[2] * kk + last_coefficients[3];
+											*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+										}
+										else{
+											*block_data_pos = unpred_data[unpredictable_count ++];
+										}
+										index ++;	
+										block_data_pos ++;
+									}
+									block_data_pos += dim1_offset - current_blockcount_z;
+								}
+								block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+							}
+							cur_unpred_count = unpredictable_count;
+						}
+					}
+					indicator_pos ++;
+					type += current_block_elements;
+					unpred_data += cur_unpred_count;
+				}
+			}
+		}
+	}
+	else{
+		type = result_type;
+		// i == 0
+		{
+			// j == 0
+			{
+				// k == 0
+				{
+					data_pos = *data;
+
+					current_blockcount_x = early_blockcount_x;
+					current_blockcount_y = early_blockcount_y;
+					current_blockcount_z = early_blockcount_z;
+					size_t current_block_elements = current_blockcount_x * current_blockcount_y * current_blockcount_z;
+					if(*indicator_pos){
+						// decompress by SZ
+						double * block_data_pos = data_pos;
+						double pred;
+						size_t index = 0;
+						int type_;
+						size_t unpredictable_count = 0;
+						// ii == 0
+						{
+							// jj == 0
+							{
+								{
+									// kk == 0
+									type_ = type[index];
+									if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										pred = 0;
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								for(size_t kk=1; kk<current_blockcount_z; kk++){
+									type_ = type[index];
+									if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										pred = block_data_pos[- 1];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z;
+							}
+							for(size_t jj=1; jj<current_blockcount_y; jj++){
+								{
+									// kk == 0
+									type_ = type[index];
+									if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										pred = block_data_pos[- dim1_offset];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								for(size_t kk=1; kk<current_blockcount_z; kk++){
+									type_ = type[index];
+									if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										pred = block_data_pos[- 1] + block_data_pos[- dim1_offset] - block_data_pos[- dim1_offset - 1];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z;
+							}
+							block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;						
+						}
+						for(size_t ii=1; ii<current_blockcount_x; ii++){
+							// jj == 0
+							{
+								{
+									// kk == 0
+									type_ = type[index];
+									if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										pred = block_data_pos[- dim0_offset];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								for(size_t kk=1; kk<current_blockcount_z; kk++){
+									type_ = type[index];
+									if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										pred = block_data_pos[- 1] + block_data_pos[- dim0_offset] - block_data_pos[- dim0_offset - 1];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z;
+							}
+							for(size_t jj=1; jj<current_blockcount_y; jj++){
+								{
+									// kk == 0
+									type_ = type[index];
+									if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										pred = block_data_pos[- dim1_offset] + block_data_pos[- dim0_offset] - block_data_pos[- dim0_offset - dim1_offset];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								for(size_t kk=1; kk<current_blockcount_z; kk++){
+									type_ = type[index];
+									if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										pred = block_data_pos[- 1] + block_data_pos[- dim1_offset] + block_data_pos[- dim0_offset] - block_data_pos[- dim1_offset - 1] - block_data_pos[- dim0_offset - 1] - block_data_pos[- dim0_offset - dim1_offset] + block_data_pos[- dim0_offset - dim1_offset - 1];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z;
+							}
+							block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+						}
+						cur_unpred_count = unpredictable_count;
+					}
+					else{
+						// decompress by regression
+						{
+							//restore regression coefficients
+							double pred;
+							int type_;
+							for(int e=0; e<4; e++){
+								type_ = coeff_type[e][coeff_index];
+								if (type_ != 0){
+									pred = last_coefficients[e];
+									last_coefficients[e] = pred + 2 * (type_ - coeff_intvRadius[e]) * precision[e];
+								}
+								else{
+									last_coefficients[e] = coeff_unpred_data[e][coeff_unpred_data_count[e]];
+									coeff_unpred_data_count[e] ++;
+								}
+							}
+							coeff_index ++;
+						}
+						{
+							double * block_data_pos = data_pos;
+							double pred;
+							int type_;
+							size_t index = 0;
+							size_t unpredictable_count = 0;
+							for(size_t ii=0; ii<current_blockcount_x; ii++){
+								for(size_t jj=0; jj<current_blockcount_y; jj++){
+									for(size_t kk=0; kk<current_blockcount_z; kk++){
+										type_ = type[index];
+										if (type_ != 0){
+											pred = last_coefficients[0] * ii + last_coefficients[1] * jj + last_coefficients[2] * kk + last_coefficients[3];
+											*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+										}
+										else{
+											*block_data_pos = unpred_data[unpredictable_count ++];
+										}
+										index ++;	
+										block_data_pos ++;
+									}
+									block_data_pos += dim1_offset - current_blockcount_z;
+								}
+								block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+							}
+							cur_unpred_count = unpredictable_count;
+						}
+					}
+					indicator_pos ++;
+					type += current_block_elements;
+					unpred_data += cur_unpred_count;
+				} // end k == 0
+				// i == 0 j == 0 k != 0
+				for(size_t k=1; k<num_z; k++){
+					offset_z = (k < split_index_z) ? k * early_blockcount_z : k * late_blockcount_z + split_index_z;
+					data_pos = *data + offset_z;
+
+					current_blockcount_x = early_blockcount_x;
+					current_blockcount_y = early_blockcount_y;
+					current_blockcount_z = (k < split_index_z) ? early_blockcount_z : late_blockcount_z;
+
+					size_t current_block_elements = current_blockcount_x * current_blockcount_y * current_blockcount_z;
+					if(*indicator_pos){
+						// decompress by SZ
+						double * block_data_pos = data_pos;
+						double pred;
+						size_t index = 0;
+						int type_;
+						size_t unpredictable_count = 0;
+						// ii == 0
+						{
+							// jj == 0
+							{
+								for(size_t kk=0; kk<current_blockcount_z; kk++){
+									type_ = type[index];
+									if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										pred = block_data_pos[- 1];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z;
+							}
+							for(size_t jj=1; jj<current_blockcount_y; jj++){
+								for(size_t kk=0; kk<current_blockcount_z; kk++){
+									type_ = type[index];
+									if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										pred = block_data_pos[- 1] + block_data_pos[- dim1_offset] - block_data_pos[- dim1_offset - 1];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z;
+							}
+							block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+						}
+						for(size_t ii=1; ii<current_blockcount_x; ii++){
+							// jj == 0
+							{
+								for(size_t kk=0; kk<current_blockcount_z; kk++){
+									type_ = type[index];
+									if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										pred = block_data_pos[- 1] + block_data_pos[- dim0_offset] - block_data_pos[- dim0_offset - 1];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z;
+							}
+							for(size_t jj=1; jj<current_blockcount_y; jj++){
+								for(size_t kk=0; kk<current_blockcount_z; kk++){
+									type_ = type[index];
+									if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										pred = block_data_pos[- 1] + block_data_pos[- dim1_offset] + block_data_pos[- dim0_offset] - block_data_pos[- dim1_offset - 1] - block_data_pos[- dim0_offset - 1] - block_data_pos[- dim0_offset - dim1_offset] + block_data_pos[- dim0_offset - dim1_offset - 1];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z;
+							}
+							block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+						}
+						cur_unpred_count = unpredictable_count;
+					}
+					else{
+						// decompress by regression
+						{
+							//restore regression coefficients
+							double pred;
+							int type_;
+							for(int e=0; e<4; e++){
+								type_ = coeff_type[e][coeff_index];
+								if (type_ != 0){
+									pred = last_coefficients[e];
+									last_coefficients[e] = pred + 2 * (type_ - coeff_intvRadius[e]) * precision[e];
+								}
+								else{
+									last_coefficients[e] = coeff_unpred_data[e][coeff_unpred_data_count[e]];
+									coeff_unpred_data_count[e] ++;
+								}
+							}
+							coeff_index ++;
+						}
+						{
+							double * block_data_pos = data_pos;
+							double pred;
+							int type_;
+							size_t index = 0;
+							size_t unpredictable_count = 0;
+							for(size_t ii=0; ii<current_blockcount_x; ii++){
+								for(size_t jj=0; jj<current_blockcount_y; jj++){
+									for(size_t kk=0; kk<current_blockcount_z; kk++){
+										type_ = type[index];
+										if (type_ != 0){
+											pred = last_coefficients[0] * ii + last_coefficients[1] * jj + last_coefficients[2] * kk + last_coefficients[3];
+											*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+										}
+										else{
+											*block_data_pos = unpred_data[unpredictable_count ++];
+										}
+										index ++;	
+										block_data_pos ++;
+									}
+									block_data_pos += dim1_offset - current_blockcount_z;
+								}
+								block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+							}
+							cur_unpred_count = unpredictable_count;
+						}
+					}
+					indicator_pos ++;
+					type += current_block_elements;
+					unpred_data += cur_unpred_count;
+				}
+			}// end j==0
+			for(size_t j=1; j<num_y; j++){
+				// k == 0
+				{
+					offset_y = (j < split_index_y) ? j * early_blockcount_y : j * late_blockcount_y + split_index_y;
+					data_pos = *data + offset_y * dim1_offset;
+
+					current_blockcount_x = early_blockcount_x;
+					current_blockcount_y = (j < split_index_y) ? early_blockcount_y : late_blockcount_y;
+					current_blockcount_z = early_blockcount_z;
+					size_t current_block_elements = current_blockcount_x * current_blockcount_y * current_blockcount_z;
+					if(*indicator_pos){
+						// decompress by SZ
+						double * block_data_pos = data_pos;
+						double pred;
+						size_t index = 0;
+						int type_;
+						size_t unpredictable_count = 0;
+						// ii == 0
+						{
+							for(size_t jj=0; jj<current_blockcount_y; jj++){
+								{
+									// kk == 0
+									type_ = type[index];
+									if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										pred = block_data_pos[- dim1_offset];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								for(size_t kk=1; kk<current_blockcount_z; kk++){
+									type_ = type[index];
+									if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										pred = block_data_pos[- 1] + block_data_pos[- dim1_offset] - block_data_pos[- dim1_offset - 1];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z;
+							}
+							block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+						}
+						for(size_t ii=1; ii<current_blockcount_x; ii++){
+							for(size_t jj=0; jj<current_blockcount_y; jj++){
+								{
+									// kk == 0
+									type_ = type[index];
+									if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										pred = block_data_pos[- dim1_offset] + block_data_pos[- dim0_offset] - block_data_pos[- dim0_offset - dim1_offset];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								for(size_t kk=1; kk<current_blockcount_z; kk++){
+									type_ = type[index];
+									if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										pred = block_data_pos[- 1] + block_data_pos[- dim1_offset] + block_data_pos[- dim0_offset] - block_data_pos[- dim1_offset - 1] - block_data_pos[- dim0_offset - 1] - block_data_pos[- dim0_offset - dim1_offset] + block_data_pos[- dim0_offset - dim1_offset - 1];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z;
+							}
+							block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+						}
+						cur_unpred_count = unpredictable_count;
+					}
+					else{
+						// decompress by regression
+						{
+							//restore regression coefficients
+							double pred;
+							int type_;
+							for(int e=0; e<4; e++){
+								type_ = coeff_type[e][coeff_index];
+								if (type_ != 0){
+									pred = last_coefficients[e];
+									last_coefficients[e] = pred + 2 * (type_ - coeff_intvRadius[e]) * precision[e];
+								}
+								else{
+									last_coefficients[e] = coeff_unpred_data[e][coeff_unpred_data_count[e]];
+									coeff_unpred_data_count[e] ++;
+								}
+							}
+							coeff_index ++;
+						}
+						{
+							double * block_data_pos = data_pos;
+							double pred;
+							int type_;
+							size_t index = 0;
+							size_t unpredictable_count = 0;
+							for(size_t ii=0; ii<current_blockcount_x; ii++){
+								for(size_t jj=0; jj<current_blockcount_y; jj++){
+									for(size_t kk=0; kk<current_blockcount_z; kk++){
+										type_ = type[index];
+										if (type_ != 0){
+											pred = last_coefficients[0] * ii + last_coefficients[1] * jj + last_coefficients[2] * kk + last_coefficients[3];
+											*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+										}
+										else{
+											*block_data_pos = unpred_data[unpredictable_count ++];
+										}
+										index ++;	
+										block_data_pos ++;
+									}
+									block_data_pos += dim1_offset - current_blockcount_z;
+								}
+								block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+							}
+							cur_unpred_count = unpredictable_count;
+						}
+					}
+					indicator_pos ++;
+					type += current_block_elements;
+					unpred_data += cur_unpred_count;
+				} // end k == 0
+				for(size_t k=1; k<num_z; k++){
+					offset_y = (j < split_index_y) ? j * early_blockcount_y : j * late_blockcount_y + split_index_y;
+					offset_z = (k < split_index_z) ? k * early_blockcount_z : k * late_blockcount_z + split_index_z;
+					data_pos = *data + offset_y * dim1_offset + offset_z;
+
+					current_blockcount_x = early_blockcount_x;
+					current_blockcount_y = (j < split_index_y) ? early_blockcount_y : late_blockcount_y;
+					current_blockcount_z = (k < split_index_z) ? early_blockcount_z : late_blockcount_z;
+
+					size_t current_block_elements = current_blockcount_x * current_blockcount_y * current_blockcount_z;
+					if(*indicator_pos){
+						// decompress by SZ
+						double * block_data_pos = data_pos;
+						double pred;
+						size_t index = 0;
+						int type_;
+						size_t unpredictable_count = 0;
+						// ii == 0
+						{
+							for(size_t jj=0; jj<current_blockcount_y; jj++){
+								for(size_t kk=0; kk<current_blockcount_z; kk++){
+									type_ = type[index];
+									if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										pred = block_data_pos[- 1] + block_data_pos[- dim1_offset] - block_data_pos[- dim1_offset - 1];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z;
+							}
+							block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+						}
+						for(size_t ii=1; ii<current_blockcount_x; ii++){
+							for(size_t jj=0; jj<current_blockcount_y; jj++){
+								for(size_t kk=0; kk<current_blockcount_z; kk++){
+									type_ = type[index];
+									if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										pred = block_data_pos[- 1] + block_data_pos[- dim1_offset] + block_data_pos[- dim0_offset] - block_data_pos[- dim1_offset - 1] - block_data_pos[- dim0_offset - 1] - block_data_pos[- dim0_offset - dim1_offset] + block_data_pos[- dim0_offset - dim1_offset - 1];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z;
+							}
+							block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+						}
+						cur_unpred_count = unpredictable_count;
+					}
+					else{
+						// decompress by regression
+						{
+							//restore regression coefficients
+							double pred;
+							int type_;
+							for(int e=0; e<4; e++){
+								type_ = coeff_type[e][coeff_index];
+								if (type_ != 0){
+									pred = last_coefficients[e];
+									last_coefficients[e] = pred + 2 * (type_ - coeff_intvRadius[e]) * precision[e];
+								}
+								else{
+									last_coefficients[e] = coeff_unpred_data[e][coeff_unpred_data_count[e]];
+									coeff_unpred_data_count[e] ++;
+								}
+							}
+							coeff_index ++;
+						}
+						{
+							double * block_data_pos = data_pos;
+							double pred;
+							int type_;
+							size_t index = 0;
+							size_t unpredictable_count = 0;
+							for(size_t ii=0; ii<current_blockcount_x; ii++){
+								for(size_t jj=0; jj<current_blockcount_y; jj++){
+									for(size_t kk=0; kk<current_blockcount_z; kk++){
+										type_ = type[index];
+										if (type_ != 0){
+											pred = last_coefficients[0] * ii + last_coefficients[1] * jj + last_coefficients[2] * kk + last_coefficients[3];
+											*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+										}
+										else{
+											*block_data_pos = unpred_data[unpredictable_count ++];
+										}
+										index ++;	
+										block_data_pos ++;
+									}
+									block_data_pos += dim1_offset - current_blockcount_z;
+								}
+								block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+							}
+							cur_unpred_count = unpredictable_count;
+						}
+					}
+					indicator_pos ++;
+					type += current_block_elements;
+					unpred_data += cur_unpred_count;
+				}
+			}
+		} // end i==0
+		for(size_t i=1; i<num_x; i++){
+			// j == 0
+			{
+				// k == 0
+				{
+					offset_x = (i < split_index_x) ? i * early_blockcount_x : i * late_blockcount_x + split_index_x;
+					data_pos = *data + offset_x * dim0_offset;
+
+					current_blockcount_x = (i < split_index_x) ? early_blockcount_x : late_blockcount_x;
+					current_blockcount_y = early_blockcount_y;
+					current_blockcount_z = early_blockcount_z;
+					size_t current_block_elements = current_blockcount_x * current_blockcount_y * current_blockcount_z;
+					if(*indicator_pos){
+						// decompress by SZ
+						double * block_data_pos = data_pos;
+						double pred;
+						size_t index = 0;
+						int type_;
+						size_t unpredictable_count = 0;
+						for(size_t ii=0; ii<current_blockcount_x; ii++){
+							// jj == 0
+							{
+								{
+									// kk == 0
+									type_ = type[index];
+									if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										pred = block_data_pos[- dim0_offset];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								for(size_t kk=1; kk<current_blockcount_z; kk++){
+									type_ = type[index];
+									if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										pred = block_data_pos[- 1] + block_data_pos[- dim0_offset] - block_data_pos[- dim0_offset - 1];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z;
+							}
+							for(size_t jj=1; jj<current_blockcount_y; jj++){
+								{
+									// kk == 0
+									type_ = type[index];
+									if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										pred = block_data_pos[- dim1_offset] + block_data_pos[- dim0_offset] - block_data_pos[- dim0_offset - dim1_offset];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								for(size_t kk=1; kk<current_blockcount_z; kk++){
+									type_ = type[index];
+									if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										pred = block_data_pos[- 1] + block_data_pos[- dim1_offset] + block_data_pos[- dim0_offset] - block_data_pos[- dim1_offset - 1] - block_data_pos[- dim0_offset - 1] - block_data_pos[- dim0_offset - dim1_offset] + block_data_pos[- dim0_offset - dim1_offset - 1];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z;
+							}
+							block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+						}
+						cur_unpred_count = unpredictable_count;
+					}
+					else{
+						// decompress by regression
+						{
+							//restore regression coefficients
+							double pred;
+							int type_;
+							for(int e=0; e<4; e++){
+								type_ = coeff_type[e][coeff_index];
+								if (type_ != 0){
+									pred = last_coefficients[e];
+									last_coefficients[e] = pred + 2 * (type_ - coeff_intvRadius[e]) * precision[e];
+								}
+								else{
+									last_coefficients[e] = coeff_unpred_data[e][coeff_unpred_data_count[e]];
+									coeff_unpred_data_count[e] ++;
+								}
+							}
+							coeff_index ++;
+						}
+						{
+							double * block_data_pos = data_pos;
+							double pred;
+							int type_;
+							size_t index = 0;
+							size_t unpredictable_count = 0;
+							for(size_t ii=0; ii<current_blockcount_x; ii++){
+								for(size_t jj=0; jj<current_blockcount_y; jj++){
+									for(size_t kk=0; kk<current_blockcount_z; kk++){
+										type_ = type[index];
+										if (type_ != 0){
+											pred = last_coefficients[0] * ii + last_coefficients[1] * jj + last_coefficients[2] * kk + last_coefficients[3];
+											*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+										}
+										else{
+											*block_data_pos = unpred_data[unpredictable_count ++];
+										}
+										index ++;	
+										block_data_pos ++;
+									}
+									block_data_pos += dim1_offset - current_blockcount_z;
+								}
+								block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+							}
+							cur_unpred_count = unpredictable_count;
+						}
+					}
+					indicator_pos ++;
+					type += current_block_elements;
+					unpred_data += cur_unpred_count;
+				} // end k == 0
+				for(size_t k=1; k<num_z; k++){
+					offset_x = (i < split_index_x) ? i * early_blockcount_x : i * late_blockcount_x + split_index_x;
+					offset_z = (k < split_index_z) ? k * early_blockcount_z : k * late_blockcount_z + split_index_z;
+					data_pos = *data + offset_x * dim0_offset + offset_z;
+
+					current_blockcount_x = (i < split_index_x) ? early_blockcount_x : late_blockcount_x;
+					current_blockcount_y = early_blockcount_y;
+					current_blockcount_z = (k < split_index_z) ? early_blockcount_z : late_blockcount_z;
+
+					size_t current_block_elements = current_blockcount_x * current_blockcount_y * current_blockcount_z;
+					if(*indicator_pos){
+						// decompress by SZ
+						double * block_data_pos = data_pos;
+						double pred;
+						size_t index = 0;
+						int type_;
+						size_t unpredictable_count = 0;
+						for(size_t ii=0; ii<current_blockcount_x; ii++){
+							// jj == 0
+							{
+								for(size_t kk=0; kk<current_blockcount_z; kk++){
+									type_ = type[index];
+									if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										pred = block_data_pos[- 1] + block_data_pos[- dim0_offset] - block_data_pos[- dim0_offset - 1];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z;
+							}
+							for(size_t jj=1; jj<current_blockcount_y; jj++){
+								for(size_t kk=0; kk<current_blockcount_z; kk++){
+									type_ = type[index];
+									if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										pred = block_data_pos[- 1] + block_data_pos[- dim1_offset] + block_data_pos[- dim0_offset] - block_data_pos[- dim1_offset - 1] - block_data_pos[- dim0_offset - 1] - block_data_pos[- dim0_offset - dim1_offset] + block_data_pos[- dim0_offset - dim1_offset - 1];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z;
+							}
+							block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+						}
+						cur_unpred_count = unpredictable_count;
+					}
+					else{
+						// decompress by regression
+						{
+							//restore regression coefficients
+							double pred;
+							int type_;
+							for(int e=0; e<4; e++){
+								type_ = coeff_type[e][coeff_index];
+								if (type_ != 0){
+									pred = last_coefficients[e];
+									last_coefficients[e] = pred + 2 * (type_ - coeff_intvRadius[e]) * precision[e];
+								}
+								else{
+									last_coefficients[e] = coeff_unpred_data[e][coeff_unpred_data_count[e]];
+									coeff_unpred_data_count[e] ++;
+								}
+							}
+							coeff_index ++;
+						}
+						{
+							double * block_data_pos = data_pos;
+							double pred;
+							int type_;
+							size_t index = 0;
+							size_t unpredictable_count = 0;
+							for(size_t ii=0; ii<current_blockcount_x; ii++){
+								for(size_t jj=0; jj<current_blockcount_y; jj++){
+									for(size_t kk=0; kk<current_blockcount_z; kk++){
+										type_ = type[index];
+										if (type_ != 0){
+											pred = last_coefficients[0] * ii + last_coefficients[1] * jj + last_coefficients[2] * kk + last_coefficients[3];
+											*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+										}
+										else{
+											*block_data_pos = unpred_data[unpredictable_count ++];
+										}
+										index ++;	
+										block_data_pos ++;
+									}
+									block_data_pos += dim1_offset - current_blockcount_z;
+								}
+								block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+							}
+							cur_unpred_count = unpredictable_count;
+						}
+					}
+					indicator_pos ++;
+					type += current_block_elements;
+					unpred_data += cur_unpred_count;
+				}
+			}// end j = 0
+			for(size_t j=1; j<num_y; j++){
+				// k == 0
+				{
+					offset_x = (i < split_index_x) ? i * early_blockcount_x : i * late_blockcount_x + split_index_x;
+					offset_y = (j < split_index_y) ? j * early_blockcount_y : j * late_blockcount_y + split_index_y;
+					data_pos = *data + offset_x * dim0_offset + offset_y * dim1_offset;
+
+					current_blockcount_x = (i < split_index_x) ? early_blockcount_x : late_blockcount_x;
+					current_blockcount_y = (j < split_index_y) ? early_blockcount_y : late_blockcount_y;
+					current_blockcount_z = early_blockcount_z;
+					size_t current_block_elements = current_blockcount_x * current_blockcount_y * current_blockcount_z;
+					if(*indicator_pos){
+						// decompress by SZ
+						double * block_data_pos = data_pos;
+						double pred;
+						size_t index = 0;
+						int type_;
+						size_t unpredictable_count = 0;
+						for(size_t ii=0; ii<current_blockcount_x; ii++){
+							for(size_t jj=0; jj<current_blockcount_y; jj++){
+								{
+									// kk == 0
+									type_ = type[index];
+									if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										pred = block_data_pos[- dim1_offset] + block_data_pos[- dim0_offset] - block_data_pos[- dim0_offset - dim1_offset];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								for(size_t kk=1; kk<current_blockcount_z; kk++){
+									type_ = type[index];
+									if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										pred = block_data_pos[- 1] + block_data_pos[- dim1_offset] + block_data_pos[- dim0_offset] - block_data_pos[- dim1_offset - 1] - block_data_pos[- dim0_offset - 1] - block_data_pos[- dim0_offset - dim1_offset] + block_data_pos[- dim0_offset - dim1_offset - 1];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z;
+							}
+							block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+						}
+						cur_unpred_count = unpredictable_count;
+					}
+					else{
+						// decompress by regression
+						{
+							//restore regression coefficients
+							double pred;
+							int type_;
+							for(int e=0; e<4; e++){
+								type_ = coeff_type[e][coeff_index];
+								if (type_ != 0){
+									pred = last_coefficients[e];
+									last_coefficients[e] = pred + 2 * (type_ - coeff_intvRadius[e]) * precision[e];
+								}
+								else{
+									last_coefficients[e] = coeff_unpred_data[e][coeff_unpred_data_count[e]];
+									coeff_unpred_data_count[e] ++;
+								}
+							}
+							coeff_index ++;
+						}
+						{
+							double * block_data_pos = data_pos;
+							double pred;
+							int type_;
+							size_t index = 0;
+							size_t unpredictable_count = 0;
+							for(size_t ii=0; ii<current_blockcount_x; ii++){
+								for(size_t jj=0; jj<current_blockcount_y; jj++){
+									for(size_t kk=0; kk<current_blockcount_z; kk++){
+										type_ = type[index];
+										if (type_ != 0){
+											pred = last_coefficients[0] * ii + last_coefficients[1] * jj + last_coefficients[2] * kk + last_coefficients[3];
+											*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+										}
+										else{
+											*block_data_pos = unpred_data[unpredictable_count ++];
+										}
+										index ++;	
+										block_data_pos ++;
+									}
+									block_data_pos += dim1_offset - current_blockcount_z;
+								}
+								block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+							}
+							cur_unpred_count = unpredictable_count;
+						}
+					}
+					indicator_pos ++;
+					type += current_block_elements;
+					unpred_data += cur_unpred_count;
+				} // end k == 0
+				for(size_t k=1; k<num_z; k++){
+					offset_x = (i < split_index_x) ? i * early_blockcount_x : i * late_blockcount_x + split_index_x;
+					offset_y = (j < split_index_y) ? j * early_blockcount_y : j * late_blockcount_y + split_index_y;
+					offset_z = (k < split_index_z) ? k * early_blockcount_z : k * late_blockcount_z + split_index_z;
+					data_pos = *data + offset_x * dim0_offset + offset_y * dim1_offset + offset_z;
+
+					current_blockcount_x = (i < split_index_x) ? early_blockcount_x : late_blockcount_x;
+					current_blockcount_y = (j < split_index_y) ? early_blockcount_y : late_blockcount_y;
+					current_blockcount_z = (k < split_index_z) ? early_blockcount_z : late_blockcount_z;
+
+					size_t current_block_elements = current_blockcount_x * current_blockcount_y * current_blockcount_z;
+					if(*indicator_pos){
+						// decompress by SZ
+						double * block_data_pos = data_pos;
+						double pred;
+						size_t index = 0;
+						int type_;
+						size_t unpredictable_count = 0;
+						for(size_t ii=0; ii<current_blockcount_x; ii++){
+							for(size_t jj=0; jj<current_blockcount_y; jj++){
+								for(size_t kk=0; kk<current_blockcount_z; kk++){
+									type_ = type[index];
+									if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										pred = block_data_pos[- 1] + block_data_pos[- dim1_offset] + block_data_pos[- dim0_offset] - block_data_pos[- dim1_offset - 1] - block_data_pos[- dim0_offset - 1] - block_data_pos[- dim0_offset - dim1_offset] + block_data_pos[- dim0_offset - dim1_offset - 1];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z;
+							}
+							block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+						}
+						cur_unpred_count = unpredictable_count;
+					}
+					else{
+						// decompress by regression
+						{
+							//restore regression coefficients
+							double pred;
+							int type_;
+							for(int e=0; e<4; e++){
+								type_ = coeff_type[e][coeff_index];
+								if (type_ != 0){
+									pred = last_coefficients[e];
+									last_coefficients[e] = pred + 2 * (type_ - coeff_intvRadius[e]) * precision[e];
+								}
+								else{
+									last_coefficients[e] = coeff_unpred_data[e][coeff_unpred_data_count[e]];
+									coeff_unpred_data_count[e] ++;
+								}
+							}
+							coeff_index ++;
+						}
+						{
+							double * block_data_pos = data_pos;
+							double pred;
+							int type_;
+							size_t index = 0;
+							size_t unpredictable_count = 0;
+							for(size_t ii=0; ii<current_blockcount_x; ii++){
+								for(size_t jj=0; jj<current_blockcount_y; jj++){
+									for(size_t kk=0; kk<current_blockcount_z; kk++){
+										type_ = type[index];
+										if (type_ != 0){
+											pred = last_coefficients[0] * ii + last_coefficients[1] * jj + last_coefficients[2] * kk + last_coefficients[3];
+											*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+										}
+										else{
+											*block_data_pos = unpred_data[unpredictable_count ++];
+										}
+										index ++;	
+										block_data_pos ++;
+									}
+									block_data_pos += dim1_offset - current_blockcount_z;
+								}
+								block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+							}
+							cur_unpred_count = unpredictable_count;
+						}
+					}
+					indicator_pos ++;
+					type += current_block_elements;
+					unpred_data += cur_unpred_count;
+				}
+			}
+		}
+	}
+
+	free(coeff_result_type);
+
+	free(indicator);
+	free(result_type);
+}
diff --git a/thirdparty/SZ/sz/src/szd_double_pwr.c b/thirdparty/SZ/sz/src/szd_double_pwr.c
index f4a6fd861df7f8a1fd5fa231a009a3dda69114b4..a3ec18e583168a8a71c55121709f219492cd6a5c 100644
--- a/thirdparty/SZ/sz/src/szd_double_pwr.c
+++ b/thirdparty/SZ/sz/src/szd_double_pwr.c
@@ -13,6 +13,7 @@
 #include "TightDataPointStorageD.h"
 #include "sz.h"
 #include "Huffman.h"
+#include "utility.h"
 //#include "rw.h"
 
 #pragma GCC diagnostic push
@@ -1347,4 +1348,77 @@ void decompressDataSeries_double_1D_pwrgroup(double** data, size_t dataSeriesLen
 	free(groupErrorBounds);
 	free(groupID);
 }
+
+void decompressDataSeries_double_1D_pwr_pre_log(double** data, size_t dataSeriesLength, TightDataPointStorageD* tdps) {
+	// Decompresses the 1D series with decompressDataSeries_double_1D, then replaces every value v by
+	// exp2(v), or by 0 when v < tdps->minLogValue, and negates the entries flagged in the sign array.
+	// A sign array is only decoded when tdps->pwrErrBoundBytes_size > 0.
+	decompressDataSeries_double_1D(data, dataSeriesLength, tdps);
+	double threshold = tdps->minLogValue;
+	// Must start as NULL: nothing visible here guarantees that sz_lossless_decompress assigns the output
+	// pointer on every path, and an unassigned pointer would be dereferenced below and handed to free().
+	unsigned char * signs = NULL;
+	if(tdps->pwrErrBoundBytes_size > 0)
+		sz_lossless_decompress(confparams_dec->losslessCompressor, tdps->pwrErrBoundBytes, tdps->pwrErrBoundBytes_size, &signs, dataSeriesLength);
+
+	double * values = *data;
+	for(size_t i=0; i<dataSeriesLength; i++){
+		if(values[i] < threshold) values[i] = 0;
+		else values[i] = exp2(values[i]);
+		// signs == NULL means no sign information is available, so the value keeps its non-negative result
+		if(signs != NULL && signs[i]) values[i] = -values[i];
+	}
+	// free(NULL) is a no-op, so the no-sign case needs no separate branch
+	free(signs);
+
+}
+
+void decompressDataSeries_double_2D_pwr_pre_log(double** data, size_t r1, size_t r2, TightDataPointStorageD* tdps) {
+	// Decompresses the r1*r2 values with decompressDataSeries_double_2D, then replaces every value v by
+	// exp2(v), or by 0 when v < tdps->minLogValue, and negates the entries flagged in the sign array.
+	// A sign array is only decoded when tdps->pwrErrBoundBytes_size > 0.
+	size_t dataSeriesLength = r1 * r2;
+	decompressDataSeries_double_2D(data, r1, r2, tdps);
+	double threshold = tdps->minLogValue;
+	// Must start as NULL: nothing visible here guarantees that sz_lossless_decompress assigns the output
+	// pointer on every path, and an unassigned pointer would be dereferenced below and handed to free().
+	unsigned char * signs = NULL;
+	if(tdps->pwrErrBoundBytes_size > 0)
+		sz_lossless_decompress(confparams_dec->losslessCompressor, tdps->pwrErrBoundBytes, tdps->pwrErrBoundBytes_size, &signs, dataSeriesLength);
+
+	double * values = *data;
+	for(size_t i=0; i<dataSeriesLength; i++){
+		if(values[i] < threshold) values[i] = 0;
+		else values[i] = exp2(values[i]);
+		// signs == NULL means no sign information is available, so the value keeps its non-negative result
+		if(signs != NULL && signs[i]) values[i] = -values[i];
+	}
+	// free(NULL) is a no-op, so the no-sign case needs no separate branch
+	free(signs);
+}
+
+void decompressDataSeries_double_3D_pwr_pre_log(double** data, size_t r1, size_t r2, size_t r3, TightDataPointStorageD* tdps) {
+	// Decompresses the r1*r2*r3 values with decompressDataSeries_double_3D, then replaces every value v by
+	// exp2(v), or by 0 when v < tdps->minLogValue, and negates the entries flagged in the sign array.
+	// A sign array is only decoded when tdps->pwrErrBoundBytes_size > 0.
+	size_t dataSeriesLength = r1 * r2 * r3;
+	decompressDataSeries_double_3D(data, r1, r2, r3, tdps);
+	double threshold = tdps->minLogValue;
+	// Must start as NULL: nothing visible here guarantees that sz_lossless_decompress assigns the output
+	// pointer on every path, and an unassigned pointer would be dereferenced below and handed to free().
+	unsigned char * signs = NULL;
+	if(tdps->pwrErrBoundBytes_size > 0)
+		sz_lossless_decompress(confparams_dec->losslessCompressor, tdps->pwrErrBoundBytes, tdps->pwrErrBoundBytes_size, &signs, dataSeriesLength);
+
+	double * values = *data;
+	for(size_t i=0; i<dataSeriesLength; i++){
+		if(values[i] < threshold) values[i] = 0;
+		else values[i] = exp2(values[i]);
+		// signs == NULL means no sign information is available, so the value keeps its non-negative result
+		if(signs != NULL && signs[i]) values[i] = -values[i];
+	}
+	// free(NULL) is a no-op, so the no-sign case needs no separate branch
+	free(signs);
+}
+
 #pragma GCC diagnostic pop
diff --git a/thirdparty/SZ/sz/src/szd_float.c b/thirdparty/SZ/sz/src/szd_float.c
index 5a420afeeec036848fe70ba58c3db3a150c3d657..0c7df9e334bff84064d1a5d5265cea5fa51d3bdf 100644
--- a/thirdparty/SZ/sz/src/szd_float.c
+++ b/thirdparty/SZ/sz/src/szd_float.c
@@ -1,7 +1,7 @@
 /**
  *  @file szd_float.c
- *  @author Sheng Di and Dingwen Tao
- *  @date Aug, 2016
+ *  @author Sheng Di, Dingwen Tao, Xin Liang
+ *  @date Aug, 2018
  *  @brief 
  *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
  *      See COPYRIGHT in top-level directory.
@@ -16,6 +16,7 @@
 #include "Huffman.h"
 #include "szd_float_pwr.h"
 #include "szd_float_ts.h"
+#include "utility.h"
 
 /**
  * 
@@ -35,10 +36,10 @@ int SZ_decompress_args_float(float** newData, size_t r5, size_t r4, size_t r3, s
 	
 	if(cmpSize!=8+4+MetaDataByteLength && cmpSize!=8+8+MetaDataByteLength) //4,8 means two posibilities of SZ_SIZE_TYPE
 	{
-		int isZlib = isZlibFormat(cmpBytes[0], cmpBytes[1]);
+		confparams_dec->losslessCompressor = is_lossless_compressed_data(cmpBytes, cmpSize);
 		if(confparams_dec->szMode!=SZ_TEMPORAL_COMPRESSION)
 		{
-			if(isZlib)
+			if(confparams_dec->losslessCompressor!=-1)
 				confparams_dec->szMode = SZ_BEST_COMPRESSION;
 			else
 				confparams_dec->szMode = SZ_BEST_SPEED;			
@@ -53,7 +54,7 @@ int SZ_decompress_args_float(float** newData, size_t r5, size_t r4, size_t r3, s
 		{
 			if(targetUncompressSize<MIN_ZLIB_DEC_ALLOMEM_BYTES) //Considering the minimum size
 				targetUncompressSize = MIN_ZLIB_DEC_ALLOMEM_BYTES; 
-			tmpSize = zlib_uncompress5(cmpBytes, (unsigned long)cmpSize, &szTmpBytes, (unsigned long)targetUncompressSize+4+MetaDataByteLength+exe_params->SZ_SIZE_TYPE);//		(unsigned long)targetUncompressSize+8: consider the total length under lossless compression mode is actually 3+4+1+targetUncompressSize
+			tmpSize = sz_lossless_decompress(confparams_dec->losslessCompressor, cmpBytes, (unsigned long)cmpSize, &szTmpBytes, (unsigned long)targetUncompressSize+4+MetaDataByteLength+exe_params->SZ_SIZE_TYPE);//		(unsigned long)targetUncompressSize+8: consider the total length under lossless compression mode is actually 3+4+1+targetUncompressSize
 			//szTmpBytes = (unsigned char*)malloc(sizeof(unsigned char)*tmpSize);
 			//memcpy(szTmpBytes, tmpBytes, tmpSize);
 			//free(tmpBytes); //release useless memory		
@@ -88,21 +89,40 @@ int SZ_decompress_args_float(float** newData, size_t r5, size_t r4, size_t r3, s
 				(*newData)[i] = bytesToFloat(p);
 		}		
 	}
-	else if (dim == 1)
-		getSnapshotData_float_1D(newData,r1,tdps, errBoundMode);
-	else
-	if (dim == 2)
-		getSnapshotData_float_2D(newData,r2,r1,tdps, errBoundMode);
-	else
-	if (dim == 3)
-		getSnapshotData_float_3D(newData,r3,r2,r1,tdps, errBoundMode);
-	else
-	if (dim == 4)
-		getSnapshotData_float_4D(newData,r4,r3,r2,r1,tdps, errBoundMode);
-	else
+	else 
 	{
-		printf("Error: currently support only at most 4 dimensions!\n");
-		status = SZ_DERR;
+		if(tdps->raBytes_size > 0) //v2.0
+		{
+			if (dim == 1)
+				getSnapshotData_float_1D(newData,r1,tdps, errBoundMode);
+			else if(dim == 2)
+				decompressDataSeries_float_2D_nonblocked_with_blocked_regression(newData, r2, r1, tdps->raBytes);
+			else if(dim == 3)
+				decompressDataSeries_float_3D_nonblocked_with_blocked_regression(newData, r3, r2, r1, tdps->raBytes);
+			else if(dim == 4)
+				decompressDataSeries_float_3D_nonblocked_with_blocked_regression(newData, r4*r3, r2, r1, tdps->raBytes);
+			else
+			{
+				printf("Error: currently support only at most 4 dimensions!\n");
+				status = SZ_DERR;
+			}	
+		}
+		else //1.4.13
+		{
+			if (dim == 1)
+				getSnapshotData_float_1D(newData,r1,tdps, errBoundMode);
+			else if (dim == 2)
+				getSnapshotData_float_2D(newData,r2,r1,tdps, errBoundMode);
+			else if (dim == 3)
+				getSnapshotData_float_3D(newData,r3,r2,r1,tdps, errBoundMode);
+			else if (dim == 4)
+				getSnapshotData_float_4D(newData,r4,r3,r2,r1,tdps, errBoundMode);
+			else
+			{
+				printf("Error: currently support only at most 4 dimensions!\n");
+				status = SZ_DERR;
+			}			
+		}
 	}
 	free_TightDataPointStorageF2(tdps);
 	if(confparams_dec->szMode!=SZ_BEST_SPEED && cmpSize!=8+MetaDataByteLength+exe_params->SZ_SIZE_TYPE)
@@ -1651,8 +1671,8 @@ void getSnapshotData_float_1D(float** data, size_t dataSeriesLength, TightDataPo
 			}
 			else 
 			{
-				//decompressDataSeries_float_1D_pwr(data, dataSeriesLength, tdps);
-				decompressDataSeries_float_1D_pwrgroup(data, dataSeriesLength, tdps);
+				decompressDataSeries_float_1D_pwr_pre_log(data, dataSeriesLength, tdps);
+				//decompressDataSeries_float_1D_pwrgroup(data, dataSeriesLength, tdps);
 			}
 			return;
 		} else {
@@ -1675,7 +1695,8 @@ void getSnapshotData_float_1D(float** data, size_t dataSeriesLength, TightDataPo
 			if(errBoundMode < PW_REL)
 				decompressDataSeries_float_1D(&decmpData, dataSeriesLength, tdps);
 			else 
-				decompressDataSeries_float_1D_pwr(&decmpData, dataSeriesLength, tdps);
+				//decompressDataSeries_float_1D_pwr(&decmpData, dataSeriesLength, tdps);
+				decompressDataSeries_float_1D_pwr_pre_log(&decmpData, dataSeriesLength, tdps);
 			// insert the decompressed data
 			size_t k = 0;
 			for (i = 0; i < dataSeriesLength; i++) {
@@ -1716,7 +1737,8 @@ void getSnapshotData_float_2D(float** data, size_t r1, size_t r2, TightDataPoint
 			}
 			else 
 			{
-				decompressDataSeries_float_2D_pwr(data, r1, r2, tdps);
+				//decompressDataSeries_float_2D_pwr(data, r1, r2, tdps);
+				decompressDataSeries_float_2D_pwr_pre_log(data, r1, r2, tdps);
 			}			
 
 			return;
@@ -1740,7 +1762,8 @@ void getSnapshotData_float_2D(float** data, size_t r1, size_t r2, TightDataPoint
 			if(errBoundMode < PW_REL)
 				decompressDataSeries_float_2D(&decmpData, r1, r2, tdps);
 			else 
-				decompressDataSeries_float_2D_pwr(&decmpData, r1, r2, tdps);
+				//decompressDataSeries_float_2D_pwr(&decmpData, r1, r2, tdps);
+				decompressDataSeries_float_2D_pwr_pre_log(&decmpData, r1, r2, tdps);
 			// insert the decompressed data
 			size_t k = 0;
 			for (i = 0; i < dataSeriesLength; i++) {
@@ -1773,7 +1796,7 @@ void getSnapshotData_float_3D(float** data, size_t r1, size_t r2, size_t r3, Tig
 					if(multisteps->compressionType == 0)
 						decompressDataSeries_float_3D(data, r1, r2, r3, tdps);
 					else
-						decompressDataSeries_float_1D_ts(data, r1*r2*r3, multisteps, tdps);					
+						decompressDataSeries_float_1D_ts(data, dataSeriesLength, multisteps, tdps);					
 				}
 				else
 #endif				
@@ -1781,7 +1804,8 @@ void getSnapshotData_float_3D(float** data, size_t r1, size_t r2, size_t r3, Tig
 			}
 			else 
 			{
-				decompressDataSeries_float_3D_pwr(data, r1, r2, r3, tdps);
+				//decompressDataSeries_float_3D_pwr(data, r1, r2, r3, tdps);
+				decompressDataSeries_float_3D_pwr_pre_log(data, r1, r2, r3, tdps);
 			}					
 			
 			return;
@@ -1805,7 +1829,8 @@ void getSnapshotData_float_3D(float** data, size_t r1, size_t r2, size_t r3, Tig
 			if(errBoundMode < PW_REL)
 				decompressDataSeries_float_3D(&decmpData, r1, r2, r3, tdps);
 			else 
-				decompressDataSeries_float_3D_pwr(&decmpData, r1, r2, r3, tdps);
+				//decompressDataSeries_float_3D_pwr(&decmpData, r1, r2, r3, tdps);
+				decompressDataSeries_float_3D_pwr_pre_log(&decmpData, r1, r2, r3, tdps);
 			// insert the decompressed data
 			size_t k = 0;
 			for (i = 0; i < dataSeriesLength; i++) {
@@ -1846,7 +1871,8 @@ void getSnapshotData_float_4D(float** data, size_t r1, size_t r2, size_t r3, siz
 			}
 			else 
 			{
-				decompressDataSeries_float_3D_pwr(data, r1*r2, r3, r4, tdps);
+				//decompressDataSeries_float_3D_pwr(data, r1*r2, r3, r4, tdps);
+				decompressDataSeries_float_3D_pwr_pre_log(data, r1*r2, r3, r4, tdps);
 				//ToDO
 				//decompressDataSeries_float_4D_pwr(data, r1, r2, r3, r4, tdps);
 			}					
@@ -1868,7 +1894,8 @@ void getSnapshotData_float_4D(float** data, size_t r1, size_t r2, size_t r3, siz
 			if(errBoundMode < PW_REL)
 				decompressDataSeries_float_4D(&decmpData, r1, r2, r3, r4, tdps);
 			else
-				decompressDataSeries_float_3D_pwr(&decmpData, r1*r2, r3, r4, tdps);
+				//decompressDataSeries_float_3D_pwr(&decmpData, r1*r2, r3, r4, tdps);
+				decompressDataSeries_float_3D_pwr_pre_log(&decmpData, r1*r2, r3, r4, tdps);
 				//ToDO
 				//decompressDataSeries_float_4D_pwr(&decompData, r1, r2, r3, r4, tdps);
 			// insert the decompressed data
@@ -2177,3 +2204,3059 @@ size_t decompressDataSeries_float_2D_RA_block(float * data, float mean, size_t d
 	return unpredictable_count;
 }
 
+void decompressDataSeries_float_2D_nonblocked_with_blocked_regression(float** data, size_t r1, size_t r2, unsigned char* comp_data){
+	// Rebuilds r1*r2 floats (r1 rows, row stride r2) from comp_data. The array is processed block by block; a per-block indicator selects neighbour-based prediction (non-zero) or a per-block linear regression (zero).
+	size_t dim0_offset = r2; // distance in elements between vertically adjacent values
+	size_t num_elements = r1 * r2;
+	// Output buffer: allocated here (the malloc result is not checked) and not freed in this function.
+	*data = (float*)malloc(sizeof(float)*num_elements);
+
+	unsigned char * comp_data_pos = comp_data; // read cursor; every field below is consumed in stream order
+
+	size_t block_size = bytesToInt_bigEndian(comp_data_pos);
+	comp_data_pos += sizeof(int);
+	// calculate block dims
+	size_t num_x, num_y;
+	SZ_COMPUTE_3D_NUMBER_OF_BLOCKS(r1, num_x, block_size);
+	SZ_COMPUTE_3D_NUMBER_OF_BLOCKS(r2, num_y, block_size);
+
+	size_t split_index_x, split_index_y;
+	size_t early_blockcount_x, early_blockcount_y;
+	size_t late_blockcount_x, late_blockcount_y;
+	SZ_COMPUTE_BLOCKCOUNT(r1, num_x, split_index_x, early_blockcount_x, late_blockcount_x);
+	SZ_COMPUTE_BLOCKCOUNT(r2, num_y, split_index_y, early_blockcount_y, late_blockcount_y);
+
+	size_t num_blocks = num_x * num_y;
+
+	double realPrecision = bytesToDouble(comp_data_pos); // step factor applied to the data type codes when dequantizing
+	comp_data_pos += sizeof(double);
+	unsigned int intervals = bytesToInt_bigEndian(comp_data_pos);
+	comp_data_pos += sizeof(int);
+
+	updateQuantizationInfo(intervals);
+
+	unsigned int tree_size = bytesToInt_bigEndian(comp_data_pos); // byte length of the serialized Huffman tree that follows the node count
+	comp_data_pos += sizeof(int);
+
+	int stateNum = 2*intervals;
+	HuffmanTree* huffmanTree = createHuffmanTree(stateNum);
+	
+	int nodeCount = bytesToInt_bigEndian(comp_data_pos);
+	
+	node root = reconstruct_HuffTree_from_bytes_anyStates(huffmanTree,comp_data_pos+sizeof(int), nodeCount); // kept until the per-element type codes are decoded further down
+	comp_data_pos += sizeof(int) + tree_size; // skips the node count and the tree bytes
+
+	float mean;
+	unsigned char use_mean; // selects which of the two reconstruction loops below runs
+	memcpy(&use_mean, comp_data_pos, sizeof(unsigned char));
+	comp_data_pos += sizeof(unsigned char);
+	memcpy(&mean, comp_data_pos, sizeof(float));
+	comp_data_pos += sizeof(float);
+	size_t reg_count = 0; // counts blocks whose indicator is 0, i.e. blocks restored by regression
+
+	unsigned char * indicator;
+	size_t indicator_bitlength = (num_blocks - 1)/8 + 1; // one bit per block, rounded up to whole bytes
+	convertByteArray2IntArray_fast_1b(num_blocks, comp_data_pos, indicator_bitlength, &indicator);
+	comp_data_pos += indicator_bitlength;
+	for(size_t i=0; i<num_blocks; i++){
+		if(!indicator[i]) reg_count ++;
+	}
+	//printf("reg_count: %ld\n", reg_count);
+
+	int coeff_intvRadius[3];
+	int * coeff_result_type = (int *) malloc(num_blocks*3*sizeof(int)); // backing store for the three coeff_type[] slices (num_blocks ints each)
+	int * coeff_type[3];
+	double precision[3];
+	float * coeff_unpred_data[3];
+	if(reg_count > 0){ // the coefficient sections are read only when at least one block uses regression
+		for(int i=0; i<3; i++){ // one section per regression coefficient
+			precision[i] = bytesToDouble(comp_data_pos);
+			comp_data_pos += sizeof(double);
+			coeff_intvRadius[i] = bytesToInt_bigEndian(comp_data_pos);
+			comp_data_pos += sizeof(int);
+			unsigned int tree_size = bytesToInt_bigEndian(comp_data_pos); // inner-scope variable, shadows the outer tree_size
+			comp_data_pos += sizeof(int);
+			int stateNum = 2*coeff_intvRadius[i]*2; // shadows the outer stateNum
+			HuffmanTree* huffmanTree = createHuffmanTree(stateNum);	
+			int nodeCount = bytesToInt_bigEndian(comp_data_pos);
+			node root = reconstruct_HuffTree_from_bytes_anyStates(huffmanTree, comp_data_pos+sizeof(int), nodeCount); // shadows the outer root, which is left untouched for the data decode
+			comp_data_pos += sizeof(int) + tree_size;
+
+			coeff_type[i] = coeff_result_type + i * num_blocks;
+			size_t typeArray_size = bytesToSize(comp_data_pos);
+			decode(comp_data_pos + sizeof(size_t), reg_count, root, coeff_type[i]); // reg_count codes: one per regression block
+			comp_data_pos += sizeof(size_t) + typeArray_size;
+			int coeff_unpred_count = bytesToInt_bigEndian(comp_data_pos);
+			comp_data_pos += sizeof(int);
+			coeff_unpred_data[i] = (float *) comp_data_pos; // points into comp_data, no copy is made; NOTE(review): alignment of this address is not established here
+			comp_data_pos += coeff_unpred_count * sizeof(float);
+			SZ_ReleaseHuffman(huffmanTree); // releases the inner (per-coefficient) tree only
+		}
+	}
+	float last_coefficients[3] = {0.0}; // running coefficient values; each regression block updates them from the previous block's values
+	int coeff_unpred_data_count[3] = {0};
+	int coeff_index = 0;
+	updateQuantizationInfo(intervals); // NOTE(review): repeats the earlier call with the same argument; presumably restores state before intvRadius is read below - confirm
+
+	size_t total_unpred;
+	memcpy(&total_unpred, comp_data_pos, sizeof(size_t));
+	comp_data_pos += sizeof(size_t);
+	float * unpred_data = (float *) comp_data_pos; // unpredictable data values, consumed block by block; points into comp_data
+	comp_data_pos += total_unpred * sizeof(float);
+
+	int * result_type = (int *) malloc(num_elements * sizeof(int));
+	decode(comp_data_pos, num_elements, root, result_type); // uses the outer Huffman root built before the coefficient sections
+	SZ_ReleaseHuffman(huffmanTree);
+	
+	int intvRadius = exe_params->intvRadius;
+	
+	int * type; // cursor into result_type, advanced by one block's element count after each block
+
+	float * data_pos = *data;
+	size_t offset_x, offset_y;
+	size_t current_blockcount_x, current_blockcount_y;
+	size_t cur_unpred_count;
+
+	unsigned char * indicator_pos = indicator;
+	if(use_mean){
+		type = result_type;
+		for(size_t i=0; i<num_x; i++){
+			for(size_t j=0; j<num_y; j++){
+				offset_x = (i < split_index_x) ? i * early_blockcount_x : i * late_blockcount_x + split_index_x;
+				offset_y = (j < split_index_y) ? j * early_blockcount_y : j * late_blockcount_y + split_index_y;
+				data_pos = *data + offset_x * dim0_offset + offset_y; // top-left element of block (i, j) in the output array
+
+				current_blockcount_x = (i < split_index_x) ? early_blockcount_x : late_blockcount_x;
+				current_blockcount_y = (j < split_index_y) ? early_blockcount_y : late_blockcount_y;
+
+				size_t current_block_elements = current_blockcount_x * current_blockcount_y;
+				if(*indicator_pos){
+					// decompress by SZ
+
+					float * block_data_pos = data_pos;
+					float pred;
+					size_t index = 0;
+					int type_;
+					// d11 is current data
+					size_t unpredictable_count = 0;
+					float d00, d01, d10;
+					for(size_t ii=0; ii<current_blockcount_x; ii++){
+						for(size_t jj=0; jj<current_blockcount_y; jj++){
+							type_ = type[index];
+							if(type_ == intvRadius){ // code equal to intvRadius: the element is the stored mean
+								*block_data_pos = mean;
+							}
+							else if(type_ == 0){ // code 0: the value is taken verbatim from the unpredictable list
+								*block_data_pos = unpred_data[unpredictable_count ++];
+							}
+							else{
+								d00 = d01 = d10 = 1; // used first as "neighbour available" flags, then overwritten with the neighbour values
+								if(i == 0 && ii == 0){ // first row of the whole array: nothing above
+									d00 = d01 = 0;
+								}
+								if(j == 0 && jj == 0){ // first column of the whole array: nothing to the left
+									d00 = d10 = 0;
+								}
+								if(d00){
+									d00 = block_data_pos[- dim0_offset - 1]; // up-left neighbour; the index is formed in size_t arithmetic
+								}
+								if(d01){
+									d01 = block_data_pos[- dim0_offset]; // neighbour one row up
+								}
+								if(d10){
+									d10 = block_data_pos[- 1]; // left neighbour
+								}
+								if(type_ < intvRadius) type_ += 1; // codes below intvRadius are shifted up by one; intvRadius itself was consumed by the mean case
+								pred = d10 + d01 - d00; // left + up - up-left; unavailable neighbours contribute 0
+								*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+							}
+							index ++;
+							block_data_pos ++;
+						}
+						block_data_pos += dim0_offset - current_blockcount_y; // jump to the start of the block's next row
+					}
+					cur_unpred_count = unpredictable_count;
+				}
+				else{
+					// decompress by regression
+					{
+						//restore regression coefficients
+						float pred;
+						int type_;
+						for(int e=0; e<3; e++){
+							type_ = coeff_type[e][coeff_index];
+							if (type_ != 0){ // non-zero code: coefficient is the previous value plus a quantized delta
+								pred = last_coefficients[e];
+								last_coefficients[e] = pred + 2 * (type_ - coeff_intvRadius[e]) * precision[e];
+							}
+							else{ // code 0: coefficient is read verbatim from its unpredictable list
+								last_coefficients[e] = coeff_unpred_data[e][coeff_unpred_data_count[e]];
+								coeff_unpred_data_count[e] ++;
+							}
+						}
+						coeff_index ++;
+					}
+					{
+						float * block_data_pos = data_pos;
+						float pred;
+						int type_;
+						size_t index = 0;
+						size_t unpredictable_count = 0;
+						for(size_t ii=0; ii<current_blockcount_x; ii++){
+							for(size_t jj=0; jj<current_blockcount_y; jj++){
+								type_ = type[index];
+								if (type_ != 0){
+									pred = last_coefficients[0] * ii + last_coefficients[1] * jj + last_coefficients[2]; // linear in the block-local coordinates (ii, jj)
+									*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+								}
+								else{
+									*block_data_pos = unpred_data[unpredictable_count ++];
+								}
+
+								index ++;	
+								block_data_pos ++;
+							}
+							block_data_pos += dim0_offset - current_blockcount_y;
+						}
+						cur_unpred_count = unpredictable_count;
+					}
+				}
+				// advance the three per-block cursors: type codes, indicator, unpredictable values
+				type += current_block_elements;
+				indicator_pos ++;
+				unpred_data += cur_unpred_count;
+			}
+		}
+	}
+	else{
+		type = result_type;
+		for(size_t i=0; i<num_x; i++){
+			for(size_t j=0; j<num_y; j++){
+				offset_x = (i < split_index_x) ? i * early_blockcount_x : i * late_blockcount_x + split_index_x;
+				offset_y = (j < split_index_y) ? j * early_blockcount_y : j * late_blockcount_y + split_index_y;
+				data_pos = *data + offset_x * dim0_offset + offset_y; // top-left element of block (i, j) in the output array
+
+				current_blockcount_x = (i < split_index_x) ? early_blockcount_x : late_blockcount_x;
+				current_blockcount_y = (j < split_index_y) ? early_blockcount_y : late_blockcount_y;
+
+				size_t current_block_elements = current_blockcount_x * current_blockcount_y;
+				if(*indicator_pos){
+					// decompress by SZ
+					
+					float * block_data_pos = data_pos;
+					float pred;
+					size_t index = 0;
+					int type_;
+					// d11 is current data
+					size_t unpredictable_count = 0;
+					float d00, d01, d10;
+					for(size_t ii=0; ii<current_blockcount_x; ii++){
+						for(size_t jj=0; jj<current_blockcount_y; jj++){
+							type_ = type[index];
+							if(type_ == 0){ // code 0: the value is taken verbatim from the unpredictable list
+								*block_data_pos = unpred_data[unpredictable_count ++];
+							}
+							else{ // unlike the use_mean loop, there is no mean case and no code shift here
+								d00 = d01 = d10 = 1; // used first as "neighbour available" flags, then overwritten with the neighbour values
+								if(i == 0 && ii == 0){ // first row of the whole array: nothing above
+									d00 = d01 = 0;
+								}
+								if(j == 0 && jj == 0){ // first column of the whole array: nothing to the left
+									d00 = d10 = 0;
+								}
+								if(d00){
+									d00 = block_data_pos[- dim0_offset - 1]; // up-left neighbour; the index is formed in size_t arithmetic
+								}
+								if(d01){
+									d01 = block_data_pos[- dim0_offset]; // neighbour one row up
+								}
+								if(d10){
+									d10 = block_data_pos[- 1]; // left neighbour
+								}
+								pred = d10 + d01 - d00; // left + up - up-left; unavailable neighbours contribute 0
+								*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+							}
+							index ++;
+							block_data_pos ++;
+						}
+						block_data_pos += dim0_offset - current_blockcount_y; // jump to the start of the block's next row
+					}
+					cur_unpred_count = unpredictable_count;
+				}
+				else{
+					// decompress by regression
+					{
+						//restore regression coefficients
+						float pred;
+						int type_;
+						for(int e=0; e<3; e++){
+							type_ = coeff_type[e][coeff_index];
+							if (type_ != 0){ // non-zero code: coefficient is the previous value plus a quantized delta
+								pred = last_coefficients[e];
+								last_coefficients[e] = pred + 2 * (type_ - coeff_intvRadius[e]) * precision[e];
+							}
+							else{ // code 0: coefficient is read verbatim from its unpredictable list
+								last_coefficients[e] = coeff_unpred_data[e][coeff_unpred_data_count[e]];
+								coeff_unpred_data_count[e] ++;
+							}
+						}
+						coeff_index ++;
+					}
+					{
+						float * block_data_pos = data_pos;
+						float pred;
+						int type_;
+						size_t index = 0;
+						size_t unpredictable_count = 0;
+						for(size_t ii=0; ii<current_blockcount_x; ii++){
+							for(size_t jj=0; jj<current_blockcount_y; jj++){
+								type_ = type[index];
+								if (type_ != 0){
+									pred = last_coefficients[0] * ii + last_coefficients[1] * jj + last_coefficients[2]; // linear in the block-local coordinates (ii, jj)
+									*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+								}
+								else{
+									*block_data_pos = unpred_data[unpredictable_count ++];
+								}
+								index ++;	
+								block_data_pos ++;
+							}
+							block_data_pos += dim0_offset - current_blockcount_y;
+						}
+						cur_unpred_count = unpredictable_count;
+					}
+				}
+				// advance the three per-block cursors: type codes, indicator, unpredictable values
+				type += current_block_elements;
+				indicator_pos ++;
+				unpred_data += cur_unpred_count;
+			}
+		}
+	}
+	free(coeff_result_type);
+
+	free(indicator);
+	free(result_type);
+}
+
+
+void decompressDataSeries_float_3D_nonblocked_with_blocked_regression(float** data, size_t r1, size_t r2, size_t r3, unsigned char* comp_data){
+
+	size_t dim0_offset = r2 * r3;
+	size_t dim1_offset = r3;
+	size_t num_elements = r1 * r2 * r3;
+
+	*data = (float*)malloc(sizeof(float)*num_elements);
+
+	unsigned char * comp_data_pos = comp_data;
+
+	size_t block_size = bytesToInt_bigEndian(comp_data_pos);
+	comp_data_pos += sizeof(int);
+	// calculate block dims
+	size_t num_x, num_y, num_z;
+	SZ_COMPUTE_3D_NUMBER_OF_BLOCKS(r1, num_x, block_size);
+	SZ_COMPUTE_3D_NUMBER_OF_BLOCKS(r2, num_y, block_size);
+	SZ_COMPUTE_3D_NUMBER_OF_BLOCKS(r3, num_z, block_size);
+
+	size_t split_index_x, split_index_y, split_index_z;
+	size_t early_blockcount_x, early_blockcount_y, early_blockcount_z;
+	size_t late_blockcount_x, late_blockcount_y, late_blockcount_z;
+	SZ_COMPUTE_BLOCKCOUNT(r1, num_x, split_index_x, early_blockcount_x, late_blockcount_x);
+	SZ_COMPUTE_BLOCKCOUNT(r2, num_y, split_index_y, early_blockcount_y, late_blockcount_y);
+	SZ_COMPUTE_BLOCKCOUNT(r3, num_z, split_index_z, early_blockcount_z, late_blockcount_z);
+
+	size_t num_blocks = num_x * num_y * num_z;
+
+	double realPrecision = bytesToDouble(comp_data_pos);
+	comp_data_pos += sizeof(double);
+	unsigned int intervals = bytesToInt_bigEndian(comp_data_pos);
+	comp_data_pos += sizeof(int);
+
+	updateQuantizationInfo(intervals);
+
+	unsigned int tree_size = bytesToInt_bigEndian(comp_data_pos);
+	comp_data_pos += sizeof(int);
+	
+	int stateNum = 2*intervals;
+	HuffmanTree* huffmanTree = createHuffmanTree(stateNum);	
+	
+	int nodeCount = bytesToInt_bigEndian(comp_data_pos);
+	node root = reconstruct_HuffTree_from_bytes_anyStates(huffmanTree,comp_data_pos+sizeof(int), nodeCount);
+	comp_data_pos += sizeof(int) + tree_size;
+
+	float mean;
+	unsigned char use_mean;
+	memcpy(&use_mean, comp_data_pos, sizeof(unsigned char));
+	comp_data_pos += sizeof(unsigned char);
+	memcpy(&mean, comp_data_pos, sizeof(float));
+	comp_data_pos += sizeof(float);
+	size_t reg_count = 0;
+
+	unsigned char * indicator;
+	size_t indicator_bitlength = (num_blocks - 1)/8 + 1;
+	convertByteArray2IntArray_fast_1b(num_blocks, comp_data_pos, indicator_bitlength, &indicator);
+	comp_data_pos += indicator_bitlength;
+	for(size_t i=0; i<num_blocks; i++){
+		if(!indicator[i]) reg_count ++;
+	}
+
+	int coeff_intvRadius[4];
+	int * coeff_result_type = (int *) malloc(num_blocks*4*sizeof(int));
+	int * coeff_type[4];
+	double precision[4];
+	float * coeff_unpred_data[4];
+	if(reg_count > 0){
+		for(int i=0; i<4; i++){
+			precision[i] = bytesToDouble(comp_data_pos);
+			comp_data_pos += sizeof(double);
+			coeff_intvRadius[i] = bytesToInt_bigEndian(comp_data_pos);
+			comp_data_pos += sizeof(int);
+			unsigned int tree_size = bytesToInt_bigEndian(comp_data_pos);
+			comp_data_pos += sizeof(int);
+			int stateNum = 2*coeff_intvRadius[i]*2;
+			HuffmanTree* huffmanTree = createHuffmanTree(stateNum);	
+			int nodeCount = bytesToInt_bigEndian(comp_data_pos);
+			node root = reconstruct_HuffTree_from_bytes_anyStates(huffmanTree, comp_data_pos+sizeof(int), nodeCount);
+			comp_data_pos += sizeof(int) + tree_size;
+
+			coeff_type[i] = coeff_result_type + i * num_blocks;
+			size_t typeArray_size = bytesToSize(comp_data_pos);
+			decode(comp_data_pos + sizeof(size_t), reg_count, root, coeff_type[i]);
+			comp_data_pos += sizeof(size_t) + typeArray_size;
+			int coeff_unpred_count = bytesToInt_bigEndian(comp_data_pos);
+			comp_data_pos += sizeof(int);
+			coeff_unpred_data[i] = (float *) comp_data_pos;
+			comp_data_pos += coeff_unpred_count * sizeof(float);
+			SZ_ReleaseHuffman(huffmanTree);
+		}
+	}
+	float last_coefficients[4] = {0.0};
+	int coeff_unpred_data_count[4] = {0};
+	int coeff_index = 0;
+	updateQuantizationInfo(intervals);
+
+	size_t total_unpred;
+	memcpy(&total_unpred, comp_data_pos, sizeof(size_t));
+	comp_data_pos += sizeof(size_t);
+	float * unpred_data = (float *) comp_data_pos;
+	comp_data_pos += total_unpred * sizeof(float);
+
+	int * result_type = (int *) malloc(num_elements * sizeof(int));
+	decode(comp_data_pos, num_elements, root, result_type);
+	SZ_ReleaseHuffman(huffmanTree);
+	
+	int intvRadius = exe_params->intvRadius;
+	
+	int * type;
+	float * data_pos = *data;
+	size_t offset_x, offset_y, offset_z;
+	size_t current_blockcount_x, current_blockcount_y, current_blockcount_z;
+	size_t cur_unpred_count;
+	unsigned char * indicator_pos = indicator;
+	if(use_mean){
+		// type = result_type;
+
+		// for(size_t i=0; i<num_x; i++){
+		// 	for(size_t j=0; j<num_y; j++){
+		// 		for(size_t k=0; k<num_z; k++){
+		// 			offset_x = (i < split_index_x) ? i * early_blockcount_x : i * late_blockcount_x + split_index_x;
+		// 			offset_y = (j < split_index_y) ? j * early_blockcount_y : j * late_blockcount_y + split_index_y;
+		// 			offset_z = (k < split_index_z) ? k * early_blockcount_z : k * late_blockcount_z + split_index_z;
+		// 			data_pos = *data + offset_x * dim0_offset + offset_y * dim1_offset + offset_z;
+
+		// 			current_blockcount_x = (i < split_index_x) ? early_blockcount_x : late_blockcount_x;
+		// 			current_blockcount_y = (j < split_index_y) ? early_blockcount_y : late_blockcount_y;
+		// 			current_blockcount_z = (k < split_index_z) ? early_blockcount_z : late_blockcount_z;
+
+		// 			// type_offset = offset_x * dim0_offset +  offset_y * current_blockcount_x * dim1_offset + offset_z * current_blockcount_x * current_blockcount_y;
+		// 			// type = result_type + type_offset;
+		// 			size_t current_block_elements = current_blockcount_x * current_blockcount_y * current_blockcount_z;
+		// 			// index = i * num_y * num_z + j * num_z + k;
+
+		// 			// printf("i j k: %ld %ld %ld\toffset: %ld %ld %ld\tindicator: %ld\n", i, j, k, offset_x, offset_y, offset_z, indicator[index]);
+		// 			if(*indicator_pos){
+		// 				// decompress by SZ
+		// 				// cur_unpred_count = decompressDataSeries_float_3D_blocked_nonblock_pred(data_pos, r1, r2, r3, current_blockcount_x, current_blockcount_y, current_blockcount_z, i, j, k, realPrecision, type, unpred_data);
+		// 				float * block_data_pos = data_pos;
+		// 				float pred;
+		// 				size_t index = 0;
+		// 				int type_;
+		// 				// d111 is current data
+		// 				size_t unpredictable_count = 0;
+		// 				float d000, d001, d010, d011, d100, d101, d110;
+		// 				for(size_t ii=0; ii<current_blockcount_x; ii++){
+		// 					for(size_t jj=0; jj<current_blockcount_y; jj++){
+		// 						for(size_t kk=0; kk<current_blockcount_z; kk++){
+		// 							type_ = type[index];
+		// 							if(type_ == intvRadius){
+		// 								*block_data_pos = mean;
+		// 							}
+		// 							else if(type_ == 0){
+		// 								*block_data_pos = unpred_data[unpredictable_count ++];
+		// 							}
+		// 							else{
+		// 								d000 = d001 = d010 = d011 = d100 = d101 = d110 = 1;
+		// 								if(i == 0 && ii == 0){
+		// 									d000 = d001 = d010 = d011 = 0;
+		// 								}
+		// 								if(j == 0 && jj == 0){
+		// 									d000 = d001 = d100 = d101 = 0;
+		// 								}
+		// 								if(k == 0 && kk == 0){
+		// 									d000 = d010 = d100 = d110 = 0;
+		// 								}
+		// 								if(d000){
+		// 									d000 = block_data_pos[- dim0_offset - dim1_offset - 1];
+		// 								}
+		// 								if(d001){
+		// 									d001 = block_data_pos[- dim0_offset - dim1_offset];
+		// 								}
+		// 								if(d010){
+		// 									d010 = block_data_pos[- dim0_offset - 1];
+		// 								}
+		// 								if(d011){
+		// 									d011 = block_data_pos[- dim0_offset];
+		// 								}
+		// 								if(d100){
+		// 									d100 = block_data_pos[- dim1_offset - 1];
+		// 								}
+		// 								if(d101){
+		// 									d101 = block_data_pos[- dim1_offset];
+		// 								}
+		// 								if(d110){
+		// 									d110 = block_data_pos[- 1];
+		// 								}
+		// 								if(type_ < intvRadius) type_ += 1;
+		// 								pred = d110 + d101 + d011 - d100 - d010 - d001 + d000;
+		// 								*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+		// 							}
+		// 							index ++;
+		// 							block_data_pos ++;
+		// 						}
+		// 						block_data_pos += dim1_offset - current_blockcount_z;
+		// 					}
+		// 					block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+		// 				}
+		// 				cur_unpred_count = unpredictable_count;
+		// 			}
+		// 			else{
+		// 				// decompress by regression
+		// 				{
+		// 					//restore regression coefficients
+		// 					float pred;
+		// 					int type_;
+		// 					for(int e=0; e<4; e++){
+		// 						// if(i == 0 && j == 0 && k == 19){
+		// 						// 	printf("~\n");
+		// 						// }
+		// 						type_ = coeff_type[e][coeff_index];
+		// 						if (type_ != 0){
+		// 							pred = last_coefficients[e];
+		// 							last_coefficients[e] = pred + 2 * (type_ - coeff_intvRadius[e]) * precision[e];
+		// 						}
+		// 						else{
+		// 							last_coefficients[e] = coeff_unpred_data[e][coeff_unpred_data_count[e]];
+		// 							coeff_unpred_data_count[e] ++;
+		// 						}
+		// 						if(fabs(last_coefficients[e]) > 10000){
+		// 							printf("%d %d %d-%d: pred %.4f type %d precision %.4g last_coefficients %.4g\n", i, j, k, e, pred, type_, precision[e], last_coefficients[e]);
+		// 							exit(0);
+		// 						}
+		// 					}
+		// 					coeff_index ++;
+		// 				}
+		// 				{
+		// 					float * block_data_pos = data_pos;
+		// 					float pred;
+		// 					int type_;
+		// 					size_t index = 0;
+		// 					size_t unpredictable_count = 0;
+		// 					for(size_t ii=0; ii<current_blockcount_x; ii++){
+		// 						for(size_t jj=0; jj<current_blockcount_y; jj++){
+		// 							for(size_t kk=0; kk<current_blockcount_z; kk++){
+		// 								if(block_data_pos - (*data) == 19470788){
+		// 									printf("dec stop\n");
+		// 								}
+
+		// 								type_ = type[index];
+		// 								if (type_ != 0){
+		// 									pred = last_coefficients[0] * ii + last_coefficients[1] * jj + last_coefficients[2] * kk + last_coefficients[3];
+		// 									*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+		// 								}
+		// 								else{
+		// 									*block_data_pos = unpred_data[unpredictable_count ++];
+		// 								}
+		// 								index ++;	
+		// 								block_data_pos ++;
+		// 							}
+		// 							block_data_pos += dim1_offset - current_blockcount_z;
+		// 						}
+		// 						block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+		// 					}
+		// 					cur_unpred_count = unpredictable_count;
+		// 				}
+		// 			}
+
+		// 			type += current_block_elements;
+		// 			indicator_pos ++;
+		// 			unpred_data += cur_unpred_count;
+		// 			// decomp_unpred += cur_unpred_count;
+		// 			// printf("block comp done, data_offset from %ld to %ld: diff %ld\n", *data, data_pos, data_pos - *data);
+		// 			// fflush(stdout);
+		// 		}
+		// 	}
+		// }
+
+		type = result_type;
+		// i == 0
+		{
+			// j == 0
+			{
+				// k == 0
+				{
+					data_pos = *data;
+
+					current_blockcount_x = early_blockcount_x;
+					current_blockcount_y = early_blockcount_y;
+					current_blockcount_z = early_blockcount_z;
+					size_t current_block_elements = current_blockcount_x * current_blockcount_y * current_blockcount_z;
+					if(*indicator_pos){
+						// decompress by SZ
+						float * block_data_pos = data_pos;
+						float pred;
+						size_t index = 0;
+						int type_;
+						size_t unpredictable_count = 0;
+						// ii == 0
+						{
+							// jj == 0
+							{
+								{
+									// kk == 0
+									type_ = type[index];
+									if(type_ == intvRadius){
+										*block_data_pos = mean;
+									}
+									else if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										if(type_ < intvRadius) type_ += 1;
+										pred = 0;
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								for(size_t kk=1; kk<current_blockcount_z; kk++){
+									type_ = type[index];
+									if(type_ == intvRadius){
+										*block_data_pos = mean;
+									}
+									else if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										if(type_ < intvRadius) type_ += 1;
+										pred = block_data_pos[- 1];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z;
+							}
+							for(size_t jj=1; jj<current_blockcount_y; jj++){
+								{
+									// kk == 0
+									type_ = type[index];
+									if(type_ == intvRadius){
+										*block_data_pos = mean;
+									}
+									else if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										if(type_ < intvRadius) type_ += 1;
+										pred = block_data_pos[- dim1_offset];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								for(size_t kk=1; kk<current_blockcount_z; kk++){
+									type_ = type[index];
+									if(type_ == intvRadius){
+										*block_data_pos = mean;
+									}
+									else if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										if(type_ < intvRadius) type_ += 1;
+										pred = block_data_pos[- 1] + block_data_pos[- dim1_offset] - block_data_pos[- dim1_offset - 1];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z;
+							}
+							block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;						
+						}
+						for(size_t ii=1; ii<current_blockcount_x; ii++){
+							// jj == 0
+							{
+								{
+									// kk == 0
+									type_ = type[index];
+									if(type_ == intvRadius){
+										*block_data_pos = mean;
+									}
+									else if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										if(type_ < intvRadius) type_ += 1;
+										pred = block_data_pos[- dim0_offset];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								for(size_t kk=1; kk<current_blockcount_z; kk++){
+									type_ = type[index];
+									if(type_ == intvRadius){
+										*block_data_pos = mean;
+									}
+									else if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										if(type_ < intvRadius) type_ += 1;
+										pred = block_data_pos[- 1] + block_data_pos[- dim0_offset] - block_data_pos[- dim0_offset - 1];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z;
+							}
+							for(size_t jj=1; jj<current_blockcount_y; jj++){
+								{
+									// kk == 0
+									type_ = type[index];
+									if(type_ == intvRadius){
+										*block_data_pos = mean;
+									}
+									else if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										if(type_ < intvRadius) type_ += 1;
+										pred = block_data_pos[- dim1_offset] + block_data_pos[- dim0_offset] - block_data_pos[- dim0_offset - dim1_offset];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								for(size_t kk=1; kk<current_blockcount_z; kk++){
+									type_ = type[index];
+									if(type_ == intvRadius){
+										*block_data_pos = mean;
+									}
+									else if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										if(type_ < intvRadius) type_ += 1;
+										pred = block_data_pos[- 1] + block_data_pos[- dim1_offset] + block_data_pos[- dim0_offset] - block_data_pos[- dim1_offset - 1] - block_data_pos[- dim0_offset - 1] - block_data_pos[- dim0_offset - dim1_offset] + block_data_pos[- dim0_offset - dim1_offset - 1];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z;
+							}
+							block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+						}
+						cur_unpred_count = unpredictable_count;
+					}
+					else{
+						// decompress by regression
+						{
+							//restore regression coefficients
+							float pred;
+							int type_;
+							for(int e=0; e<4; e++){
+								type_ = coeff_type[e][coeff_index];
+								if (type_ != 0){
+									pred = last_coefficients[e];
+									last_coefficients[e] = pred + 2 * (type_ - coeff_intvRadius[e]) * precision[e];
+								}
+								else{
+									last_coefficients[e] = coeff_unpred_data[e][coeff_unpred_data_count[e]];
+									coeff_unpred_data_count[e] ++;
+								}
+							}
+							coeff_index ++;
+						}
+						{
+							float * block_data_pos = data_pos;
+							float pred;
+							int type_;
+							size_t index = 0;
+							size_t unpredictable_count = 0;
+							for(size_t ii=0; ii<current_blockcount_x; ii++){
+								for(size_t jj=0; jj<current_blockcount_y; jj++){
+									for(size_t kk=0; kk<current_blockcount_z; kk++){
+										type_ = type[index];
+										if (type_ != 0){
+											pred = last_coefficients[0] * ii + last_coefficients[1] * jj + last_coefficients[2] * kk + last_coefficients[3];
+											*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+										}
+										else{
+											*block_data_pos = unpred_data[unpredictable_count ++];
+										}
+										index ++;	
+										block_data_pos ++;
+									}
+									block_data_pos += dim1_offset - current_blockcount_z;
+								}
+								block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+							}
+							cur_unpred_count = unpredictable_count;
+						}
+					}
+					indicator_pos ++;
+					type += current_block_elements;
+					unpred_data += cur_unpred_count;
+				} // end k == 0
+				// i == 0 j == 0 k != 0
+				for(size_t k=1; k<num_z; k++){
+					offset_z = (k < split_index_z) ? k * early_blockcount_z : k * late_blockcount_z + split_index_z;
+					data_pos = *data + offset_z;
+
+					current_blockcount_x = early_blockcount_x;
+					current_blockcount_y = early_blockcount_y;
+					current_blockcount_z = (k < split_index_z) ? early_blockcount_z : late_blockcount_z;
+
+					size_t current_block_elements = current_blockcount_x * current_blockcount_y * current_blockcount_z;
+					if(*indicator_pos){
+						// decompress by SZ
+						float * block_data_pos = data_pos;
+						float pred;
+						size_t index = 0;
+						int type_;
+						size_t unpredictable_count = 0;
+						// ii == 0
+						{
+							// jj == 0
+							{
+								for(size_t kk=0; kk<current_blockcount_z; kk++){
+									type_ = type[index];
+									if(type_ == intvRadius){
+										*block_data_pos = mean;
+									}
+									else if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										if(type_ < intvRadius) type_ += 1;
+										pred = block_data_pos[- 1];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z;
+							}
+							for(size_t jj=1; jj<current_blockcount_y; jj++){
+								for(size_t kk=0; kk<current_blockcount_z; kk++){
+									type_ = type[index];
+									if(type_ == intvRadius){
+										*block_data_pos = mean;
+									}
+									else if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										if(type_ < intvRadius) type_ += 1;
+										pred = block_data_pos[- 1] + block_data_pos[- dim1_offset] - block_data_pos[- dim1_offset - 1];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z;
+							}
+							block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+						}
+						for(size_t ii=1; ii<current_blockcount_x; ii++){
+							// jj == 0
+							{
+								for(size_t kk=0; kk<current_blockcount_z; kk++){
+									type_ = type[index];
+									if(type_ == intvRadius){
+										*block_data_pos = mean;
+									}
+									else if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										if(type_ < intvRadius) type_ += 1;
+										pred = block_data_pos[- 1] + block_data_pos[- dim0_offset] - block_data_pos[- dim0_offset - 1];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z;
+							}
+							for(size_t jj=1; jj<current_blockcount_y; jj++){
+								for(size_t kk=0; kk<current_blockcount_z; kk++){
+									type_ = type[index];
+									if(type_ == intvRadius){
+										*block_data_pos = mean;
+									}
+									else if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										if(type_ < intvRadius) type_ += 1;
+										pred = block_data_pos[- 1] + block_data_pos[- dim1_offset] + block_data_pos[- dim0_offset] - block_data_pos[- dim1_offset - 1] - block_data_pos[- dim0_offset - 1] - block_data_pos[- dim0_offset - dim1_offset] + block_data_pos[- dim0_offset - dim1_offset - 1];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z;
+							}
+							block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+						}
+						cur_unpred_count = unpredictable_count;
+					}
+					else{
+						// decompress by regression
+						{
+							//restore regression coefficients
+							float pred;
+							int type_;
+							for(int e=0; e<4; e++){
+								type_ = coeff_type[e][coeff_index];
+								if (type_ != 0){
+									pred = last_coefficients[e];
+									last_coefficients[e] = pred + 2 * (type_ - coeff_intvRadius[e]) * precision[e];
+								}
+								else{
+									last_coefficients[e] = coeff_unpred_data[e][coeff_unpred_data_count[e]];
+									coeff_unpred_data_count[e] ++;
+								}
+							}
+							coeff_index ++;
+						}
+						{
+							float * block_data_pos = data_pos;
+							float pred;
+							int type_;
+							size_t index = 0;
+							size_t unpredictable_count = 0;
+							for(size_t ii=0; ii<current_blockcount_x; ii++){
+								for(size_t jj=0; jj<current_blockcount_y; jj++){
+									for(size_t kk=0; kk<current_blockcount_z; kk++){
+										type_ = type[index];
+										if (type_ != 0){
+											pred = last_coefficients[0] * ii + last_coefficients[1] * jj + last_coefficients[2] * kk + last_coefficients[3];
+											*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+										}
+										else{
+											*block_data_pos = unpred_data[unpredictable_count ++];
+										}
+										index ++;	
+										block_data_pos ++;
+									}
+									block_data_pos += dim1_offset - current_blockcount_z;
+								}
+								block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+							}
+							cur_unpred_count = unpredictable_count;
+						}
+					}
+					indicator_pos ++;
+					type += current_block_elements;
+					unpred_data += cur_unpred_count;
+				}
+			}// end j==0
+			for(size_t j=1; j<num_y; j++){
+				// k == 0
+				{
+					offset_y = (j < split_index_y) ? j * early_blockcount_y : j * late_blockcount_y + split_index_y;
+					data_pos = *data + offset_y * dim1_offset;
+
+					current_blockcount_x = early_blockcount_x;
+					current_blockcount_y = (j < split_index_y) ? early_blockcount_y : late_blockcount_y;
+					current_blockcount_z = early_blockcount_z;
+					size_t current_block_elements = current_blockcount_x * current_blockcount_y * current_blockcount_z;
+					if(*indicator_pos){
+						// decompress by SZ
+						float * block_data_pos = data_pos;
+						float pred;
+						size_t index = 0;
+						int type_;
+						size_t unpredictable_count = 0;
+						// ii == 0
+						{
+							for(size_t jj=0; jj<current_blockcount_y; jj++){
+								{
+									// kk == 0
+									type_ = type[index];
+									if(type_ == intvRadius){
+										*block_data_pos = mean;
+									}
+									else if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										if(type_ < intvRadius) type_ += 1;
+										pred = block_data_pos[- dim1_offset];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								for(size_t kk=1; kk<current_blockcount_z; kk++){
+									type_ = type[index];
+									if(type_ == intvRadius){
+										*block_data_pos = mean;
+									}
+									else if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										if(type_ < intvRadius) type_ += 1;
+										pred = block_data_pos[- 1] + block_data_pos[- dim1_offset] - block_data_pos[- dim1_offset - 1];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z;
+							}
+							block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+						}
+						for(size_t ii=1; ii<current_blockcount_x; ii++){
+							for(size_t jj=0; jj<current_blockcount_y; jj++){
+								{
+									// kk == 0
+									type_ = type[index];
+									if(type_ == intvRadius){
+										*block_data_pos = mean;
+									}
+									else if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										if(type_ < intvRadius) type_ += 1;
+										pred = block_data_pos[- dim1_offset] + block_data_pos[- dim0_offset] - block_data_pos[- dim0_offset - dim1_offset];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								for(size_t kk=1; kk<current_blockcount_z; kk++){
+									type_ = type[index];
+									if(type_ == intvRadius){
+										*block_data_pos = mean;
+									}
+									else if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										if(type_ < intvRadius) type_ += 1;
+										pred = block_data_pos[- 1] + block_data_pos[- dim1_offset] + block_data_pos[- dim0_offset] - block_data_pos[- dim1_offset - 1] - block_data_pos[- dim0_offset - 1] - block_data_pos[- dim0_offset - dim1_offset] + block_data_pos[- dim0_offset - dim1_offset - 1];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z;
+							}
+							block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+						}
+						cur_unpred_count = unpredictable_count;
+					}
+					else{
+						// decompress by regression
+						{
+							//restore regression coefficients
+							float pred;
+							int type_;
+							for(int e=0; e<4; e++){
+								type_ = coeff_type[e][coeff_index];
+								if (type_ != 0){
+									pred = last_coefficients[e];
+									last_coefficients[e] = pred + 2 * (type_ - coeff_intvRadius[e]) * precision[e];
+								}
+								else{
+									last_coefficients[e] = coeff_unpred_data[e][coeff_unpred_data_count[e]];
+									coeff_unpred_data_count[e] ++;
+								}
+							}
+							coeff_index ++;
+						}
+						{
+							float * block_data_pos = data_pos;
+							float pred;
+							int type_;
+							size_t index = 0;
+							size_t unpredictable_count = 0;
+							for(size_t ii=0; ii<current_blockcount_x; ii++){
+								for(size_t jj=0; jj<current_blockcount_y; jj++){
+									for(size_t kk=0; kk<current_blockcount_z; kk++){
+										type_ = type[index];
+										if (type_ != 0){
+											pred = last_coefficients[0] * ii + last_coefficients[1] * jj + last_coefficients[2] * kk + last_coefficients[3];
+											*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+										}
+										else{
+											*block_data_pos = unpred_data[unpredictable_count ++];
+										}
+										index ++;	
+										block_data_pos ++;
+									}
+									block_data_pos += dim1_offset - current_blockcount_z;
+								}
+								block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+							}
+							cur_unpred_count = unpredictable_count;
+						}
+					}
+					indicator_pos ++;
+					type += current_block_elements;
+					unpred_data += cur_unpred_count;
+				} // end k == 0
+				for(size_t k=1; k<num_z; k++){
+					offset_y = (j < split_index_y) ? j * early_blockcount_y : j * late_blockcount_y + split_index_y;
+					offset_z = (k < split_index_z) ? k * early_blockcount_z : k * late_blockcount_z + split_index_z;
+					data_pos = *data + offset_y * dim1_offset + offset_z;
+
+					current_blockcount_x = early_blockcount_x;
+					current_blockcount_y = (j < split_index_y) ? early_blockcount_y : late_blockcount_y;
+					current_blockcount_z = (k < split_index_z) ? early_blockcount_z : late_blockcount_z;
+
+					size_t current_block_elements = current_blockcount_x * current_blockcount_y * current_blockcount_z;
+					if(*indicator_pos){
+						// decompress by SZ
+						float * block_data_pos = data_pos;
+						float pred;
+						size_t index = 0;
+						int type_;
+						size_t unpredictable_count = 0;
+						// ii == 0
+						{
+							for(size_t jj=0; jj<current_blockcount_y; jj++){
+								for(size_t kk=0; kk<current_blockcount_z; kk++){
+									type_ = type[index];
+									if(type_ == intvRadius){
+										*block_data_pos = mean;
+									}
+									else if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										if(type_ < intvRadius) type_ += 1;
+										pred = block_data_pos[- 1] + block_data_pos[- dim1_offset] - block_data_pos[- dim1_offset - 1];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z;
+							}
+							block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+						}
+						for(size_t ii=1; ii<current_blockcount_x; ii++){
+							for(size_t jj=0; jj<current_blockcount_y; jj++){
+								for(size_t kk=0; kk<current_blockcount_z; kk++){
+									type_ = type[index];
+									if(type_ == intvRadius){
+										*block_data_pos = mean;
+									}
+									else if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										if(type_ < intvRadius) type_ += 1;
+										pred = block_data_pos[- 1] + block_data_pos[- dim1_offset] + block_data_pos[- dim0_offset] - block_data_pos[- dim1_offset - 1] - block_data_pos[- dim0_offset - 1] - block_data_pos[- dim0_offset - dim1_offset] + block_data_pos[- dim0_offset - dim1_offset - 1];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z;
+							}
+							block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+						}
+						cur_unpred_count = unpredictable_count;
+					}
+					else{
+						// decompress by regression
+						{
+							//restore regression coefficients
+							float pred;
+							int type_;
+							for(int e=0; e<4; e++){
+								type_ = coeff_type[e][coeff_index];
+								if (type_ != 0){
+									pred = last_coefficients[e];
+									last_coefficients[e] = pred + 2 * (type_ - coeff_intvRadius[e]) * precision[e];
+								}
+								else{
+									last_coefficients[e] = coeff_unpred_data[e][coeff_unpred_data_count[e]];
+									coeff_unpred_data_count[e] ++;
+								}
+							}
+							coeff_index ++;
+						}
+						{
+							float * block_data_pos = data_pos;
+							float pred;
+							int type_;
+							size_t index = 0;
+							size_t unpredictable_count = 0;
+							for(size_t ii=0; ii<current_blockcount_x; ii++){
+								for(size_t jj=0; jj<current_blockcount_y; jj++){
+									for(size_t kk=0; kk<current_blockcount_z; kk++){
+										type_ = type[index];
+										if (type_ != 0){
+											pred = last_coefficients[0] * ii + last_coefficients[1] * jj + last_coefficients[2] * kk + last_coefficients[3];
+											*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+										}
+										else{
+											*block_data_pos = unpred_data[unpredictable_count ++];
+										}
+										index ++;	
+										block_data_pos ++;
+									}
+									block_data_pos += dim1_offset - current_blockcount_z;
+								}
+								block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+							}
+							cur_unpred_count = unpredictable_count;
+						}
+					}
+					indicator_pos ++;
+					type += current_block_elements;
+					unpred_data += cur_unpred_count;
+				}
+			}
+		} // end i==0
+		for(size_t i=1; i<num_x; i++){
+			// j == 0
+			{
+				// k == 0
+				{
+					offset_x = (i < split_index_x) ? i * early_blockcount_x : i * late_blockcount_x + split_index_x;
+					data_pos = *data + offset_x * dim0_offset;
+
+					current_blockcount_x = (i < split_index_x) ? early_blockcount_x : late_blockcount_x;
+					current_blockcount_y = early_blockcount_y;
+					current_blockcount_z = early_blockcount_z;
+					size_t current_block_elements = current_blockcount_x * current_blockcount_y * current_blockcount_z;
+					if(*indicator_pos){
+						// decompress by SZ
+						float * block_data_pos = data_pos;
+						float pred;
+						size_t index = 0;
+						int type_;
+						size_t unpredictable_count = 0;
+						for(size_t ii=0; ii<current_blockcount_x; ii++){
+							// jj == 0
+							{
+								{
+									// kk == 0
+									type_ = type[index];
+									if(type_ == intvRadius){
+										*block_data_pos = mean;
+									}
+									else if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										if(type_ < intvRadius) type_ += 1;
+										pred = block_data_pos[- dim0_offset];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								for(size_t kk=1; kk<current_blockcount_z; kk++){
+									type_ = type[index];
+									if(type_ == intvRadius){
+										*block_data_pos = mean;
+									}
+									else if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										if(type_ < intvRadius) type_ += 1;
+										pred = block_data_pos[- 1] + block_data_pos[- dim0_offset] - block_data_pos[- dim0_offset - 1];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z;
+							}
+							for(size_t jj=1; jj<current_blockcount_y; jj++){
+								{
+									// kk == 0
+									type_ = type[index];
+									if(type_ == intvRadius){
+										*block_data_pos = mean;
+									}
+									else if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										if(type_ < intvRadius) type_ += 1;
+										pred = block_data_pos[- dim1_offset] + block_data_pos[- dim0_offset] - block_data_pos[- dim0_offset - dim1_offset];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								for(size_t kk=1; kk<current_blockcount_z; kk++){
+									type_ = type[index];
+									if(type_ == intvRadius){
+										*block_data_pos = mean;
+									}
+									else if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										if(type_ < intvRadius) type_ += 1;
+										pred = block_data_pos[- 1] + block_data_pos[- dim1_offset] + block_data_pos[- dim0_offset] - block_data_pos[- dim1_offset - 1] - block_data_pos[- dim0_offset - 1] - block_data_pos[- dim0_offset - dim1_offset] + block_data_pos[- dim0_offset - dim1_offset - 1];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z;
+							}
+							block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+						}
+						cur_unpred_count = unpredictable_count;
+					}
+					else{
+						// decompress by regression
+						{
+							//restore regression coefficients
+							float pred;
+							int type_;
+							for(int e=0; e<4; e++){
+								type_ = coeff_type[e][coeff_index];
+								if (type_ != 0){
+									pred = last_coefficients[e];
+									last_coefficients[e] = pred + 2 * (type_ - coeff_intvRadius[e]) * precision[e];
+								}
+								else{
+									last_coefficients[e] = coeff_unpred_data[e][coeff_unpred_data_count[e]];
+									coeff_unpred_data_count[e] ++;
+								}
+							}
+							coeff_index ++;
+						}
+						{
+							float * block_data_pos = data_pos;
+							float pred;
+							int type_;
+							size_t index = 0;
+							size_t unpredictable_count = 0;
+							for(size_t ii=0; ii<current_blockcount_x; ii++){
+								for(size_t jj=0; jj<current_blockcount_y; jj++){
+									for(size_t kk=0; kk<current_blockcount_z; kk++){
+										type_ = type[index];
+										if (type_ != 0){
+											pred = last_coefficients[0] * ii + last_coefficients[1] * jj + last_coefficients[2] * kk + last_coefficients[3];
+											*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+										}
+										else{
+											*block_data_pos = unpred_data[unpredictable_count ++];
+										}
+										index ++;	
+										block_data_pos ++;
+									}
+									block_data_pos += dim1_offset - current_blockcount_z;
+								}
+								block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+							}
+							cur_unpred_count = unpredictable_count;
+						}
+					}
+					indicator_pos ++;
+					type += current_block_elements;
+					unpred_data += cur_unpred_count;
+				} // end k == 0
+				for(size_t k=1; k<num_z; k++){
+					offset_x = (i < split_index_x) ? i * early_blockcount_x : i * late_blockcount_x + split_index_x;
+					offset_z = (k < split_index_z) ? k * early_blockcount_z : k * late_blockcount_z + split_index_z;
+					data_pos = *data + offset_x * dim0_offset + offset_z;
+
+					current_blockcount_x = (i < split_index_x) ? early_blockcount_x : late_blockcount_x;
+					current_blockcount_y = early_blockcount_y;
+					current_blockcount_z = (k < split_index_z) ? early_blockcount_z : late_blockcount_z;
+					size_t current_block_elements = current_blockcount_x * current_blockcount_y * current_blockcount_z;
+					if(*indicator_pos){
+						// decompress by SZ
+						float * block_data_pos = data_pos;
+						float pred;
+						size_t index = 0;
+						int type_;
+						size_t unpredictable_count = 0;
+						for(size_t ii=0; ii<current_blockcount_x; ii++){
+							// jj == 0
+							{
+								for(size_t kk=0; kk<current_blockcount_z; kk++){
+									type_ = type[index];
+									if(type_ == intvRadius){
+										*block_data_pos = mean;
+									}
+									else if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										if(type_ < intvRadius) type_ += 1;
+										pred = block_data_pos[- 1] + block_data_pos[- dim0_offset] - block_data_pos[- dim0_offset - 1];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z;
+							}
+							for(size_t jj=1; jj<current_blockcount_y; jj++){
+								for(size_t kk=0; kk<current_blockcount_z; kk++){
+									type_ = type[index];
+									if(type_ == intvRadius){
+										*block_data_pos = mean;
+									}
+									else if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										if(type_ < intvRadius) type_ += 1;
+										pred = block_data_pos[- 1] + block_data_pos[- dim1_offset] + block_data_pos[- dim0_offset] - block_data_pos[- dim1_offset - 1] - block_data_pos[- dim0_offset - 1] - block_data_pos[- dim0_offset - dim1_offset] + block_data_pos[- dim0_offset - dim1_offset - 1];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z;
+							}
+							block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+						}
+						cur_unpred_count = unpredictable_count;
+					}
+					else{
+						// decompress by regression
+						{
+							//restore regression coefficients
+							float pred;
+							int type_;
+							for(int e=0; e<4; e++){
+								type_ = coeff_type[e][coeff_index];
+								if (type_ != 0){
+									pred = last_coefficients[e];
+									last_coefficients[e] = pred + 2 * (type_ - coeff_intvRadius[e]) * precision[e];
+								}
+								else{
+									last_coefficients[e] = coeff_unpred_data[e][coeff_unpred_data_count[e]];
+									coeff_unpred_data_count[e] ++;
+								}
+							}
+							coeff_index ++;
+						}
+						{
+							float * block_data_pos = data_pos;
+							float pred;
+							int type_;
+							size_t index = 0;
+							size_t unpredictable_count = 0;
+							for(size_t ii=0; ii<current_blockcount_x; ii++){
+								for(size_t jj=0; jj<current_blockcount_y; jj++){
+									for(size_t kk=0; kk<current_blockcount_z; kk++){
+										type_ = type[index];
+										if (type_ != 0){
+											pred = last_coefficients[0] * ii + last_coefficients[1] * jj + last_coefficients[2] * kk + last_coefficients[3];
+											*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+										}
+										else{
+											*block_data_pos = unpred_data[unpredictable_count ++];
+										}
+										index ++;	
+										block_data_pos ++;
+									}
+									block_data_pos += dim1_offset - current_blockcount_z;
+								}
+								block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+							}
+							cur_unpred_count = unpredictable_count;
+						}
+					}
+					indicator_pos ++;
+					type += current_block_elements;
+					unpred_data += cur_unpred_count;
+				}
+			}// end j = 0
+			for(size_t j=1; j<num_y; j++){
+				// k == 0
+				{
+					offset_x = (i < split_index_x) ? i * early_blockcount_x : i * late_blockcount_x + split_index_x;
+					offset_y = (j < split_index_y) ? j * early_blockcount_y : j * late_blockcount_y + split_index_y;
+					data_pos = *data + offset_x * dim0_offset + offset_y * dim1_offset;
+
+					current_blockcount_x = (i < split_index_x) ? early_blockcount_x : late_blockcount_x;
+					current_blockcount_y = (j < split_index_y) ? early_blockcount_y : late_blockcount_y;
+					current_blockcount_z = early_blockcount_z;
+					size_t current_block_elements = current_blockcount_x * current_blockcount_y * current_blockcount_z;
+					if(*indicator_pos){
+						// decompress by SZ
+						float * block_data_pos = data_pos;
+						float pred;
+						size_t index = 0;
+						int type_;
+						size_t unpredictable_count = 0;
+						for(size_t ii=0; ii<current_blockcount_x; ii++){
+							for(size_t jj=0; jj<current_blockcount_y; jj++){
+								{
+									// kk == 0
+									type_ = type[index];
+									if(type_ == intvRadius){
+										*block_data_pos = mean;
+									}
+									else if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										if(type_ < intvRadius) type_ += 1;
+										pred = block_data_pos[- dim1_offset] + block_data_pos[- dim0_offset] - block_data_pos[- dim0_offset - dim1_offset];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								for(size_t kk=1; kk<current_blockcount_z; kk++){
+									type_ = type[index];
+									if(type_ == intvRadius){
+										*block_data_pos = mean;
+									}
+									else if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										if(type_ < intvRadius) type_ += 1;
+										pred = block_data_pos[- 1] + block_data_pos[- dim1_offset] + block_data_pos[- dim0_offset] - block_data_pos[- dim1_offset - 1] - block_data_pos[- dim0_offset - 1] - block_data_pos[- dim0_offset - dim1_offset] + block_data_pos[- dim0_offset - dim1_offset - 1];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z;
+							}
+							block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+						}
+						cur_unpred_count = unpredictable_count;
+					}
+					else{
+						// decompress by regression
+						{
+							//restore regression coefficients
+							float pred;
+							int type_;
+							for(int e=0; e<4; e++){
+								type_ = coeff_type[e][coeff_index];
+								if (type_ != 0){
+									pred = last_coefficients[e];
+									last_coefficients[e] = pred + 2 * (type_ - coeff_intvRadius[e]) * precision[e];
+								}
+								else{
+									last_coefficients[e] = coeff_unpred_data[e][coeff_unpred_data_count[e]];
+									coeff_unpred_data_count[e] ++;
+								}
+							}
+							coeff_index ++;
+						}
+						{
+							float * block_data_pos = data_pos;
+							float pred;
+							int type_;
+							size_t index = 0;
+							size_t unpredictable_count = 0;
+							for(size_t ii=0; ii<current_blockcount_x; ii++){
+								for(size_t jj=0; jj<current_blockcount_y; jj++){
+									for(size_t kk=0; kk<current_blockcount_z; kk++){
+										type_ = type[index];
+										if (type_ != 0){
+											pred = last_coefficients[0] * ii + last_coefficients[1] * jj + last_coefficients[2] * kk + last_coefficients[3];
+											*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+										}
+										else{
+											*block_data_pos = unpred_data[unpredictable_count ++];
+										}
+										index ++;	
+										block_data_pos ++;
+									}
+									block_data_pos += dim1_offset - current_blockcount_z;
+								}
+								block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+							}
+							cur_unpred_count = unpredictable_count;
+						}
+					}
+					indicator_pos ++;
+					type += current_block_elements;
+					unpred_data += cur_unpred_count;
+				} // end k == 0
+				for(size_t k=1; k<num_z; k++){
+					offset_x = (i < split_index_x) ? i * early_blockcount_x : i * late_blockcount_x + split_index_x;
+					offset_y = (j < split_index_y) ? j * early_blockcount_y : j * late_blockcount_y + split_index_y;
+					offset_z = (k < split_index_z) ? k * early_blockcount_z : k * late_blockcount_z + split_index_z;
+					data_pos = *data + offset_x * dim0_offset + offset_y * dim1_offset + offset_z;
+
+					current_blockcount_x = (i < split_index_x) ? early_blockcount_x : late_blockcount_x;
+					current_blockcount_y = (j < split_index_y) ? early_blockcount_y : late_blockcount_y;
+					current_blockcount_z = (k < split_index_z) ? early_blockcount_z : late_blockcount_z;
+
+					size_t current_block_elements = current_blockcount_x * current_blockcount_y * current_blockcount_z;
+					if(*indicator_pos){
+						// decompress by SZ
+						float * block_data_pos = data_pos;
+						float pred;
+						size_t index = 0;
+						int type_;
+						size_t unpredictable_count = 0;
+						for(size_t ii=0; ii<current_blockcount_x; ii++){
+							for(size_t jj=0; jj<current_blockcount_y; jj++){
+								for(size_t kk=0; kk<current_blockcount_z; kk++){
+									type_ = type[index];
+									if(type_ == intvRadius){
+										*block_data_pos = mean;
+									}
+									else if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										if(type_ < intvRadius) type_ += 1;
+										pred = block_data_pos[- 1] + block_data_pos[- dim1_offset] + block_data_pos[- dim0_offset] - block_data_pos[- dim1_offset - 1] - block_data_pos[- dim0_offset - 1] - block_data_pos[- dim0_offset - dim1_offset] + block_data_pos[- dim0_offset - dim1_offset - 1];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z;
+							}
+							block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+						}
+						cur_unpred_count = unpredictable_count;
+					}
+					else{
+						// decompress by regression
+						{
+							//restore regression coefficients
+							float pred;
+							int type_;
+							for(int e=0; e<4; e++){
+								type_ = coeff_type[e][coeff_index];
+								if (type_ != 0){
+									pred = last_coefficients[e];
+									last_coefficients[e] = pred + 2 * (type_ - coeff_intvRadius[e]) * precision[e];
+								}
+								else{
+									last_coefficients[e] = coeff_unpred_data[e][coeff_unpred_data_count[e]];
+									coeff_unpred_data_count[e] ++;
+								}
+							}
+							coeff_index ++;
+						}
+						{
+							float * block_data_pos = data_pos;
+							float pred;
+							int type_;
+							size_t index = 0;
+							size_t unpredictable_count = 0;
+							for(size_t ii=0; ii<current_blockcount_x; ii++){
+								for(size_t jj=0; jj<current_blockcount_y; jj++){
+									for(size_t kk=0; kk<current_blockcount_z; kk++){
+										type_ = type[index];
+										if (type_ != 0){
+											pred = last_coefficients[0] * ii + last_coefficients[1] * jj + last_coefficients[2] * kk + last_coefficients[3];
+											*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+										}
+										else{
+											*block_data_pos = unpred_data[unpredictable_count ++];
+										}
+										index ++;	
+										block_data_pos ++;
+									}
+									block_data_pos += dim1_offset - current_blockcount_z;
+								}
+								block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+							}
+							cur_unpred_count = unpredictable_count;
+						}
+					}
+					indicator_pos ++;
+					type += current_block_elements;
+					unpred_data += cur_unpred_count;
+				}
+			}
+		}
+	}
+	else{
+		type = result_type;
+		// i == 0
+		{
+			// j == 0
+			{
+				// k == 0
+				{
+					data_pos = *data;
+
+					current_blockcount_x = early_blockcount_x;
+					current_blockcount_y = early_blockcount_y;
+					current_blockcount_z = early_blockcount_z;
+					size_t current_block_elements = current_blockcount_x * current_blockcount_y * current_blockcount_z;
+					if(*indicator_pos){
+						// decompress by SZ
+						float * block_data_pos = data_pos;
+						float pred;
+						size_t index = 0;
+						int type_;
+						size_t unpredictable_count = 0;
+						// ii == 0
+						{
+							// jj == 0
+							{
+								{
+									// kk == 0
+									type_ = type[index];
+									if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										pred = 0;
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								for(size_t kk=1; kk<current_blockcount_z; kk++){
+									type_ = type[index];
+									if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										pred = block_data_pos[- 1];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z;
+							}
+							for(size_t jj=1; jj<current_blockcount_y; jj++){
+								{
+									// kk == 0
+									type_ = type[index];
+									if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										pred = block_data_pos[- dim1_offset];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								for(size_t kk=1; kk<current_blockcount_z; kk++){
+									type_ = type[index];
+									if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										pred = block_data_pos[- 1] + block_data_pos[- dim1_offset] - block_data_pos[- dim1_offset - 1];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z;
+							}
+							block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;						
+						}
+						for(size_t ii=1; ii<current_blockcount_x; ii++){
+							// jj == 0
+							{
+								{
+									// kk == 0
+									type_ = type[index];
+									if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										pred = block_data_pos[- dim0_offset];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								for(size_t kk=1; kk<current_blockcount_z; kk++){
+									type_ = type[index];
+									if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										pred = block_data_pos[- 1] + block_data_pos[- dim0_offset] - block_data_pos[- dim0_offset - 1];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z;
+							}
+							for(size_t jj=1; jj<current_blockcount_y; jj++){
+								{
+									// kk == 0
+									type_ = type[index];
+									if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										pred = block_data_pos[- dim1_offset] + block_data_pos[- dim0_offset] - block_data_pos[- dim0_offset - dim1_offset];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								for(size_t kk=1; kk<current_blockcount_z; kk++){
+									type_ = type[index];
+									if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										pred = block_data_pos[- 1] + block_data_pos[- dim1_offset] + block_data_pos[- dim0_offset] - block_data_pos[- dim1_offset - 1] - block_data_pos[- dim0_offset - 1] - block_data_pos[- dim0_offset - dim1_offset] + block_data_pos[- dim0_offset - dim1_offset - 1];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z;
+							}
+							block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+						}
+						cur_unpred_count = unpredictable_count;
+					}
+					else{
+						// decompress by regression
+						{
+							//restore regression coefficients
+							float pred;
+							int type_;
+							for(int e=0; e<4; e++){
+								type_ = coeff_type[e][coeff_index];
+								if (type_ != 0){
+									pred = last_coefficients[e];
+									last_coefficients[e] = pred + 2 * (type_ - coeff_intvRadius[e]) * precision[e];
+								}
+								else{
+									last_coefficients[e] = coeff_unpred_data[e][coeff_unpred_data_count[e]];
+									coeff_unpred_data_count[e] ++;
+								}
+							}
+							coeff_index ++;
+						}
+						{
+							float * block_data_pos = data_pos;
+							float pred;
+							int type_;
+							size_t index = 0;
+							size_t unpredictable_count = 0;
+							for(size_t ii=0; ii<current_blockcount_x; ii++){
+								for(size_t jj=0; jj<current_blockcount_y; jj++){
+									for(size_t kk=0; kk<current_blockcount_z; kk++){
+										type_ = type[index];
+										if (type_ != 0){
+											pred = last_coefficients[0] * ii + last_coefficients[1] * jj + last_coefficients[2] * kk + last_coefficients[3];
+											*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+										}
+										else{
+											*block_data_pos = unpred_data[unpredictable_count ++];
+										}
+										index ++;	
+										block_data_pos ++;
+									}
+									block_data_pos += dim1_offset - current_blockcount_z;
+								}
+								block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+							}
+							cur_unpred_count = unpredictable_count;
+						}
+					}
+					indicator_pos ++;
+					type += current_block_elements;
+					unpred_data += cur_unpred_count;
+				} // end k == 0
+				// i == 0 j == 0 k != 0
+				for(size_t k=1; k<num_z; k++){
+					offset_z = (k < split_index_z) ? k * early_blockcount_z : k * late_blockcount_z + split_index_z;
+					data_pos = *data + offset_z;
+
+					current_blockcount_x = early_blockcount_x;
+					current_blockcount_y = early_blockcount_y;
+					current_blockcount_z = (k < split_index_z) ? early_blockcount_z : late_blockcount_z;
+
+					size_t current_block_elements = current_blockcount_x * current_blockcount_y * current_blockcount_z;
+					if(*indicator_pos){
+						// decompress by SZ
+						float * block_data_pos = data_pos;
+						float pred;
+						size_t index = 0;
+						int type_;
+						size_t unpredictable_count = 0;
+						// ii == 0
+						{
+							// jj == 0
+							{
+								for(size_t kk=0; kk<current_blockcount_z; kk++){
+									type_ = type[index];
+									if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										pred = block_data_pos[- 1];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z;
+							}
+							for(size_t jj=1; jj<current_blockcount_y; jj++){
+								for(size_t kk=0; kk<current_blockcount_z; kk++){
+									type_ = type[index];
+									if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										pred = block_data_pos[- 1] + block_data_pos[- dim1_offset] - block_data_pos[- dim1_offset - 1];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z;
+							}
+							block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+						}
+						for(size_t ii=1; ii<current_blockcount_x; ii++){
+							// jj == 0
+							{
+								for(size_t kk=0; kk<current_blockcount_z; kk++){
+									type_ = type[index];
+									if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										pred = block_data_pos[- 1] + block_data_pos[- dim0_offset] - block_data_pos[- dim0_offset - 1];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z;
+							}
+							for(size_t jj=1; jj<current_blockcount_y; jj++){
+								for(size_t kk=0; kk<current_blockcount_z; kk++){
+									type_ = type[index];
+									if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										pred = block_data_pos[- 1] + block_data_pos[- dim1_offset] + block_data_pos[- dim0_offset] - block_data_pos[- dim1_offset - 1] - block_data_pos[- dim0_offset - 1] - block_data_pos[- dim0_offset - dim1_offset] + block_data_pos[- dim0_offset - dim1_offset - 1];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z;
+							}
+							block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+						}
+						cur_unpred_count = unpredictable_count;
+					}
+					else{
+						// decompress by regression
+						{
+							//restore regression coefficients
+							float pred;
+							int type_;
+							for(int e=0; e<4; e++){
+								type_ = coeff_type[e][coeff_index];
+								if (type_ != 0){
+									pred = last_coefficients[e];
+									last_coefficients[e] = pred + 2 * (type_ - coeff_intvRadius[e]) * precision[e];
+								}
+								else{
+									last_coefficients[e] = coeff_unpred_data[e][coeff_unpred_data_count[e]];
+									coeff_unpred_data_count[e] ++;
+								}
+							}
+							coeff_index ++;
+						}
+						{
+							float * block_data_pos = data_pos;
+							float pred;
+							int type_;
+							size_t index = 0;
+							size_t unpredictable_count = 0;
+							for(size_t ii=0; ii<current_blockcount_x; ii++){
+								for(size_t jj=0; jj<current_blockcount_y; jj++){
+									for(size_t kk=0; kk<current_blockcount_z; kk++){
+										type_ = type[index];
+										if (type_ != 0){
+											pred = last_coefficients[0] * ii + last_coefficients[1] * jj + last_coefficients[2] * kk + last_coefficients[3];
+											*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+										}
+										else{
+											*block_data_pos = unpred_data[unpredictable_count ++];
+										}
+										index ++;	
+										block_data_pos ++;
+									}
+									block_data_pos += dim1_offset - current_blockcount_z;
+								}
+								block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+							}
+							cur_unpred_count = unpredictable_count;
+						}
+					}
+					indicator_pos ++;
+					type += current_block_elements;
+					unpred_data += cur_unpred_count;
+				}
+			}// end j==0
+			for(size_t j=1; j<num_y; j++){
+				// k == 0
+				{
+					offset_y = (j < split_index_y) ? j * early_blockcount_y : j * late_blockcount_y + split_index_y;
+					data_pos = *data + offset_y * dim1_offset;
+
+					current_blockcount_x = early_blockcount_x;
+					current_blockcount_y = (j < split_index_y) ? early_blockcount_y : late_blockcount_y;
+					current_blockcount_z = early_blockcount_z;
+					size_t current_block_elements = current_blockcount_x * current_blockcount_y * current_blockcount_z;
+					if(*indicator_pos){
+						// decompress by SZ
+						float * block_data_pos = data_pos;
+						float pred;
+						size_t index = 0;
+						int type_;
+						size_t unpredictable_count = 0;
+						// ii == 0
+						{
+							for(size_t jj=0; jj<current_blockcount_y; jj++){
+								{
+									// kk == 0
+									type_ = type[index];
+									if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										pred = block_data_pos[- dim1_offset];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								for(size_t kk=1; kk<current_blockcount_z; kk++){
+									type_ = type[index];
+									if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										pred = block_data_pos[- 1] + block_data_pos[- dim1_offset] - block_data_pos[- dim1_offset - 1];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z;
+							}
+							block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+						}
+						for(size_t ii=1; ii<current_blockcount_x; ii++){
+							for(size_t jj=0; jj<current_blockcount_y; jj++){
+								{
+									// kk == 0
+									type_ = type[index];
+									if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										pred = block_data_pos[- dim1_offset] + block_data_pos[- dim0_offset] - block_data_pos[- dim0_offset - dim1_offset];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								for(size_t kk=1; kk<current_blockcount_z; kk++){
+									type_ = type[index];
+									if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										pred = block_data_pos[- 1] + block_data_pos[- dim1_offset] + block_data_pos[- dim0_offset] - block_data_pos[- dim1_offset - 1] - block_data_pos[- dim0_offset - 1] - block_data_pos[- dim0_offset - dim1_offset] + block_data_pos[- dim0_offset - dim1_offset - 1];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z;
+							}
+							block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+						}
+						cur_unpred_count = unpredictable_count;
+					}
+					else{
+						// decompress by regression
+						{
+							//restore regression coefficients
+							float pred;
+							int type_;
+							for(int e=0; e<4; e++){
+								type_ = coeff_type[e][coeff_index];
+								if (type_ != 0){
+									pred = last_coefficients[e];
+									last_coefficients[e] = pred + 2 * (type_ - coeff_intvRadius[e]) * precision[e];
+								}
+								else{
+									last_coefficients[e] = coeff_unpred_data[e][coeff_unpred_data_count[e]];
+									coeff_unpred_data_count[e] ++;
+								}
+							}
+							coeff_index ++;
+						}
+						{
+							float * block_data_pos = data_pos;
+							float pred;
+							int type_;
+							size_t index = 0;
+							size_t unpredictable_count = 0;
+							for(size_t ii=0; ii<current_blockcount_x; ii++){
+								for(size_t jj=0; jj<current_blockcount_y; jj++){
+									for(size_t kk=0; kk<current_blockcount_z; kk++){
+										type_ = type[index];
+										if (type_ != 0){
+											pred = last_coefficients[0] * ii + last_coefficients[1] * jj + last_coefficients[2] * kk + last_coefficients[3];
+											*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+										}
+										else{
+											*block_data_pos = unpred_data[unpredictable_count ++];
+										}
+										index ++;	
+										block_data_pos ++;
+									}
+									block_data_pos += dim1_offset - current_blockcount_z;
+								}
+								block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+							}
+							cur_unpred_count = unpredictable_count;
+						}
+					}
+					indicator_pos ++;
+					type += current_block_elements;
+					unpred_data += cur_unpred_count;
+				} // end k == 0
+				for(size_t k=1; k<num_z; k++){
+					offset_y = (j < split_index_y) ? j * early_blockcount_y : j * late_blockcount_y + split_index_y;
+					offset_z = (k < split_index_z) ? k * early_blockcount_z : k * late_blockcount_z + split_index_z;
+					data_pos = *data + offset_y * dim1_offset + offset_z;
+
+					current_blockcount_x = early_blockcount_x;
+					current_blockcount_y = (j < split_index_y) ? early_blockcount_y : late_blockcount_y;
+					current_blockcount_z = (k < split_index_z) ? early_blockcount_z : late_blockcount_z;
+
+					size_t current_block_elements = current_blockcount_x * current_blockcount_y * current_blockcount_z;
+					if(*indicator_pos){
+						// decompress by SZ
+						float * block_data_pos = data_pos;
+						float pred;
+						size_t index = 0;
+						int type_;
+						size_t unpredictable_count = 0;
+						// ii == 0
+						{
+							for(size_t jj=0; jj<current_blockcount_y; jj++){
+								for(size_t kk=0; kk<current_blockcount_z; kk++){
+									type_ = type[index];
+									if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										pred = block_data_pos[- 1] + block_data_pos[- dim1_offset] - block_data_pos[- dim1_offset - 1];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z;
+							}
+							block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+						}
+						for(size_t ii=1; ii<current_blockcount_x; ii++){
+							for(size_t jj=0; jj<current_blockcount_y; jj++){
+								for(size_t kk=0; kk<current_blockcount_z; kk++){
+									type_ = type[index];
+									if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										pred = block_data_pos[- 1] + block_data_pos[- dim1_offset] + block_data_pos[- dim0_offset] - block_data_pos[- dim1_offset - 1] - block_data_pos[- dim0_offset - 1] - block_data_pos[- dim0_offset - dim1_offset] + block_data_pos[- dim0_offset - dim1_offset - 1];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z;
+							}
+							block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+						}
+						cur_unpred_count = unpredictable_count;
+					}
+					else{
+						// decompress by regression
+						{
+							//restore regression coefficients
+							float pred;
+							int type_;
+							for(int e=0; e<4; e++){
+								type_ = coeff_type[e][coeff_index];
+								if (type_ != 0){
+									pred = last_coefficients[e];
+									last_coefficients[e] = pred + 2 * (type_ - coeff_intvRadius[e]) * precision[e];
+								}
+								else{
+									last_coefficients[e] = coeff_unpred_data[e][coeff_unpred_data_count[e]];
+									coeff_unpred_data_count[e] ++;
+								}
+							}
+							coeff_index ++;
+						}
+						{
+							float * block_data_pos = data_pos;
+							float pred;
+							int type_;
+							size_t index = 0;
+							size_t unpredictable_count = 0;
+							for(size_t ii=0; ii<current_blockcount_x; ii++){
+								for(size_t jj=0; jj<current_blockcount_y; jj++){
+									for(size_t kk=0; kk<current_blockcount_z; kk++){
+										type_ = type[index];
+										if (type_ != 0){
+											pred = last_coefficients[0] * ii + last_coefficients[1] * jj + last_coefficients[2] * kk + last_coefficients[3];
+											*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+										}
+										else{
+											*block_data_pos = unpred_data[unpredictable_count ++];
+										}
+										index ++;	
+										block_data_pos ++;
+									}
+									block_data_pos += dim1_offset - current_blockcount_z;
+								}
+								block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+							}
+							cur_unpred_count = unpredictable_count;
+						}
+					}
+					indicator_pos ++;
+					type += current_block_elements;
+					unpred_data += cur_unpred_count;
+				}
+			}
+		} // end i==0
+		for(size_t i=1; i<num_x; i++){
+			// j == 0
+			{
+				// k == 0
+				{
+					offset_x = (i < split_index_x) ? i * early_blockcount_x : i * late_blockcount_x + split_index_x;
+					data_pos = *data + offset_x * dim0_offset;
+
+					current_blockcount_x = (i < split_index_x) ? early_blockcount_x : late_blockcount_x;
+					current_blockcount_y = early_blockcount_y;
+					current_blockcount_z = early_blockcount_z;
+					size_t current_block_elements = current_blockcount_x * current_blockcount_y * current_blockcount_z;
+					if(*indicator_pos){
+						// decompress by SZ
+						float * block_data_pos = data_pos;
+						float pred;
+						size_t index = 0;
+						int type_;
+						size_t unpredictable_count = 0;
+						for(size_t ii=0; ii<current_blockcount_x; ii++){
+							// jj == 0
+							{
+								{
+									// kk == 0
+									type_ = type[index];
+									if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										pred = block_data_pos[- dim0_offset];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								for(size_t kk=1; kk<current_blockcount_z; kk++){
+									type_ = type[index];
+									if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										pred = block_data_pos[- 1] + block_data_pos[- dim0_offset] - block_data_pos[- dim0_offset - 1];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z;
+							}
+							for(size_t jj=1; jj<current_blockcount_y; jj++){
+								{
+									// kk == 0
+									type_ = type[index];
+									if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										pred = block_data_pos[- dim1_offset] + block_data_pos[- dim0_offset] - block_data_pos[- dim0_offset - dim1_offset];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								for(size_t kk=1; kk<current_blockcount_z; kk++){
+									type_ = type[index];
+									if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										pred = block_data_pos[- 1] + block_data_pos[- dim1_offset] + block_data_pos[- dim0_offset] - block_data_pos[- dim1_offset - 1] - block_data_pos[- dim0_offset - 1] - block_data_pos[- dim0_offset - dim1_offset] + block_data_pos[- dim0_offset - dim1_offset - 1];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z;
+							}
+							block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+						}
+						cur_unpred_count = unpredictable_count;
+					}
+					else{
+						// decompress by regression
+						{
+							//restore regression coefficients
+							float pred;
+							int type_;
+							for(int e=0; e<4; e++){
+								type_ = coeff_type[e][coeff_index];
+								if (type_ != 0){
+									pred = last_coefficients[e];
+									last_coefficients[e] = pred + 2 * (type_ - coeff_intvRadius[e]) * precision[e];
+								}
+								else{
+									last_coefficients[e] = coeff_unpred_data[e][coeff_unpred_data_count[e]];
+									coeff_unpred_data_count[e] ++;
+								}
+							}
+							coeff_index ++;
+						}
+						{
+							float * block_data_pos = data_pos;
+							float pred;
+							int type_;
+							size_t index = 0;
+							size_t unpredictable_count = 0;
+							for(size_t ii=0; ii<current_blockcount_x; ii++){
+								for(size_t jj=0; jj<current_blockcount_y; jj++){
+									for(size_t kk=0; kk<current_blockcount_z; kk++){
+										type_ = type[index];
+										if (type_ != 0){
+											pred = last_coefficients[0] * ii + last_coefficients[1] * jj + last_coefficients[2] * kk + last_coefficients[3];
+											*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+										}
+										else{
+											*block_data_pos = unpred_data[unpredictable_count ++];
+										}
+										index ++;	
+										block_data_pos ++;
+									}
+									block_data_pos += dim1_offset - current_blockcount_z;
+								}
+								block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+							}
+							cur_unpred_count = unpredictable_count;
+						}
+					}
+					indicator_pos ++;
+					type += current_block_elements;
+					unpred_data += cur_unpred_count;
+				} // end k == 0
+				for(size_t k=1; k<num_z; k++){
+					offset_x = (i < split_index_x) ? i * early_blockcount_x : i * late_blockcount_x + split_index_x;
+					offset_z = (k < split_index_z) ? k * early_blockcount_z : k * late_blockcount_z + split_index_z;
+					data_pos = *data + offset_x * dim0_offset + offset_z;
+
+					current_blockcount_x = (i < split_index_x) ? early_blockcount_x : late_blockcount_x;
+					current_blockcount_y = early_blockcount_y;
+					current_blockcount_z = (k < split_index_z) ? early_blockcount_z : late_blockcount_z;
+
+					size_t current_block_elements = current_blockcount_x * current_blockcount_y * current_blockcount_z;
+					if(*indicator_pos){
+						// decompress by SZ
+						float * block_data_pos = data_pos;
+						float pred;
+						size_t index = 0;
+						int type_;
+						size_t unpredictable_count = 0;
+						for(size_t ii=0; ii<current_blockcount_x; ii++){
+							// jj == 0
+							{
+								for(size_t kk=0; kk<current_blockcount_z; kk++){
+									type_ = type[index];
+									if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										pred = block_data_pos[- 1] + block_data_pos[- dim0_offset] - block_data_pos[- dim0_offset - 1];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z;
+							}
+							for(size_t jj=1; jj<current_blockcount_y; jj++){
+								for(size_t kk=0; kk<current_blockcount_z; kk++){
+									type_ = type[index];
+									if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										pred = block_data_pos[- 1] + block_data_pos[- dim1_offset] + block_data_pos[- dim0_offset] - block_data_pos[- dim1_offset - 1] - block_data_pos[- dim0_offset - 1] - block_data_pos[- dim0_offset - dim1_offset] + block_data_pos[- dim0_offset - dim1_offset - 1];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z;
+							}
+							block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+						}
+						cur_unpred_count = unpredictable_count;
+					}
+					else{
+						// decompress by regression
+						{
+							//restore regression coefficients
+							float pred;
+							int type_;
+							for(int e=0; e<4; e++){
+								type_ = coeff_type[e][coeff_index];
+								if (type_ != 0){
+									pred = last_coefficients[e];
+									last_coefficients[e] = pred + 2 * (type_ - coeff_intvRadius[e]) * precision[e];
+								}
+								else{
+									last_coefficients[e] = coeff_unpred_data[e][coeff_unpred_data_count[e]];
+									coeff_unpred_data_count[e] ++;
+								}
+							}
+							coeff_index ++;
+						}
+						{
+							float * block_data_pos = data_pos;
+							float pred;
+							int type_;
+							size_t index = 0;
+							size_t unpredictable_count = 0;
+							for(size_t ii=0; ii<current_blockcount_x; ii++){
+								for(size_t jj=0; jj<current_blockcount_y; jj++){
+									for(size_t kk=0; kk<current_blockcount_z; kk++){
+										type_ = type[index];
+										if (type_ != 0){
+											pred = last_coefficients[0] * ii + last_coefficients[1] * jj + last_coefficients[2] * kk + last_coefficients[3];
+											*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+										}
+										else{
+											*block_data_pos = unpred_data[unpredictable_count ++];
+										}
+										index ++;	
+										block_data_pos ++;
+									}
+									block_data_pos += dim1_offset - current_blockcount_z;
+								}
+								block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+							}
+							cur_unpred_count = unpredictable_count;
+						}
+					}
+					indicator_pos ++;
+					type += current_block_elements;
+					unpred_data += cur_unpred_count;
+				}
+			}// end j = 0
+			for(size_t j=1; j<num_y; j++){
+				// k == 0
+				{
+					offset_x = (i < split_index_x) ? i * early_blockcount_x : i * late_blockcount_x + split_index_x;
+					offset_y = (j < split_index_y) ? j * early_blockcount_y : j * late_blockcount_y + split_index_y;
+					data_pos = *data + offset_x * dim0_offset + offset_y * dim1_offset;
+
+					current_blockcount_x = (i < split_index_x) ? early_blockcount_x : late_blockcount_x;
+					current_blockcount_y = (j < split_index_y) ? early_blockcount_y : late_blockcount_y;
+					current_blockcount_z = early_blockcount_z;
+					size_t current_block_elements = current_blockcount_x * current_blockcount_y * current_blockcount_z;
+					if(*indicator_pos){
+						// decompress by SZ
+						float * block_data_pos = data_pos;
+						float pred;
+						size_t index = 0;
+						int type_;
+						size_t unpredictable_count = 0;
+						for(size_t ii=0; ii<current_blockcount_x; ii++){
+							for(size_t jj=0; jj<current_blockcount_y; jj++){
+								{
+									// kk == 0
+									type_ = type[index];
+									if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										pred = block_data_pos[- dim1_offset] + block_data_pos[- dim0_offset] - block_data_pos[- dim0_offset - dim1_offset];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								for(size_t kk=1; kk<current_blockcount_z; kk++){
+									type_ = type[index];
+									if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										pred = block_data_pos[- 1] + block_data_pos[- dim1_offset] + block_data_pos[- dim0_offset] - block_data_pos[- dim1_offset - 1] - block_data_pos[- dim0_offset - 1] - block_data_pos[- dim0_offset - dim1_offset] + block_data_pos[- dim0_offset - dim1_offset - 1];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z;
+							}
+							block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+						}
+						cur_unpred_count = unpredictable_count;
+					}
+					else{
+						// decompress by regression
+						{
+							//restore regression coefficients
+							float pred;
+							int type_;
+							for(int e=0; e<4; e++){
+								type_ = coeff_type[e][coeff_index];
+								if (type_ != 0){
+									pred = last_coefficients[e];
+									last_coefficients[e] = pred + 2 * (type_ - coeff_intvRadius[e]) * precision[e];
+								}
+								else{
+									last_coefficients[e] = coeff_unpred_data[e][coeff_unpred_data_count[e]];
+									coeff_unpred_data_count[e] ++;
+								}
+							}
+							coeff_index ++;
+						}
+						{
+							float * block_data_pos = data_pos;
+							float pred;
+							int type_;
+							size_t index = 0;
+							size_t unpredictable_count = 0;
+							for(size_t ii=0; ii<current_blockcount_x; ii++){
+								for(size_t jj=0; jj<current_blockcount_y; jj++){
+									for(size_t kk=0; kk<current_blockcount_z; kk++){
+										type_ = type[index];
+										if (type_ != 0){
+											pred = last_coefficients[0] * ii + last_coefficients[1] * jj + last_coefficients[2] * kk + last_coefficients[3];
+											*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+										}
+										else{
+											*block_data_pos = unpred_data[unpredictable_count ++];
+										}
+										index ++;	
+										block_data_pos ++;
+									}
+									block_data_pos += dim1_offset - current_blockcount_z;
+								}
+								block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+							}
+							cur_unpred_count = unpredictable_count;
+						}
+					}
+					indicator_pos ++;
+					type += current_block_elements;
+					unpred_data += cur_unpred_count;
+				} // end k == 0
+				for(size_t k=1; k<num_z; k++){
+					offset_x = (i < split_index_x) ? i * early_blockcount_x : i * late_blockcount_x + split_index_x;
+					offset_y = (j < split_index_y) ? j * early_blockcount_y : j * late_blockcount_y + split_index_y;
+					offset_z = (k < split_index_z) ? k * early_blockcount_z : k * late_blockcount_z + split_index_z;
+					data_pos = *data + offset_x * dim0_offset + offset_y * dim1_offset + offset_z;
+
+					current_blockcount_x = (i < split_index_x) ? early_blockcount_x : late_blockcount_x;
+					current_blockcount_y = (j < split_index_y) ? early_blockcount_y : late_blockcount_y;
+					current_blockcount_z = (k < split_index_z) ? early_blockcount_z : late_blockcount_z;
+
+					size_t current_block_elements = current_blockcount_x * current_blockcount_y * current_blockcount_z;
+					if(*indicator_pos){
+						// decompress by SZ
+						float * block_data_pos = data_pos;
+						float pred;
+						size_t index = 0;
+						int type_;
+						size_t unpredictable_count = 0;
+						for(size_t ii=0; ii<current_blockcount_x; ii++){
+							for(size_t jj=0; jj<current_blockcount_y; jj++){
+								for(size_t kk=0; kk<current_blockcount_z; kk++){
+									type_ = type[index];
+									if(type_ == 0){
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{
+										pred = block_data_pos[- 1] + block_data_pos[- dim1_offset] + block_data_pos[- dim0_offset] - block_data_pos[- dim1_offset - 1] - block_data_pos[- dim0_offset - 1] - block_data_pos[- dim0_offset - dim1_offset] + block_data_pos[- dim0_offset - dim1_offset - 1];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+									block_data_pos ++;
+								}
+								block_data_pos += dim1_offset - current_blockcount_z;
+							}
+							block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+						}
+						cur_unpred_count = unpredictable_count;
+					}
+					else{
+						// decompress by regression
+						{
+							//restore regression coefficients
+							float pred;
+							int type_;
+							for(int e=0; e<4; e++){
+								type_ = coeff_type[e][coeff_index];
+								if (type_ != 0){
+									pred = last_coefficients[e];
+									last_coefficients[e] = pred + 2 * (type_ - coeff_intvRadius[e]) * precision[e];
+								}
+								else{
+									last_coefficients[e] = coeff_unpred_data[e][coeff_unpred_data_count[e]];
+									coeff_unpred_data_count[e] ++;
+								}
+							}
+							coeff_index ++;
+						}
+						{
+							float * block_data_pos = data_pos;
+							float pred;
+							int type_;
+							size_t index = 0;
+							size_t unpredictable_count = 0;
+							for(size_t ii=0; ii<current_blockcount_x; ii++){
+								for(size_t jj=0; jj<current_blockcount_y; jj++){
+									for(size_t kk=0; kk<current_blockcount_z; kk++){
+										type_ = type[index];
+										if (type_ != 0){
+											pred = last_coefficients[0] * ii + last_coefficients[1] * jj + last_coefficients[2] * kk + last_coefficients[3];
+											*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+										}
+										else{
+											*block_data_pos = unpred_data[unpredictable_count ++];
+										}
+										index ++;	
+										block_data_pos ++;
+									}
+									block_data_pos += dim1_offset - current_blockcount_z;
+								}
+								block_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+							}
+							cur_unpred_count = unpredictable_count;
+						}
+					}
+					indicator_pos ++;
+					type += current_block_elements;
+					unpred_data += cur_unpred_count;
+				}
+			}
+		}
+	}
+	
+#ifdef HAVE_TIMECMPR	
+	if(confparams_dec->szMode == SZ_TEMPORAL_COMPRESSION)
+		memcpy(multisteps->hist_data, (*data), num_elements*sizeof(float));
+#endif	
+
+	free(coeff_result_type);
+
+	free(indicator);
+	free(result_type);
+}
+
+void decompressDataSeries_float_3D_random_access_with_blocked_regression(float** data, size_t r1, size_t r2, size_t r3, unsigned char* comp_data){
+	// Decodes comp_data into a freshly malloc'ed r1*r2*r3 float array returned through *data; every block is rebuilt in its own scratch cube and then copied into place.
+	size_t dim0_offset = r2 * r3; // element stride of the first (r1) dimension in *data
+	size_t dim1_offset = r3; // element stride of the second (r2) dimension in *data
+	size_t num_elements = r1 * r2 * r3;
+
+	*data = (float*)malloc(sizeof(float)*num_elements); // NOTE(review): malloc result is not checked before use
+
+	unsigned char * comp_data_pos = comp_data;
+	// comp_data_pos is the read cursor; the stream starts with the block edge length (an int)
+	size_t block_size = bytesToInt_bigEndian(comp_data_pos);
+	comp_data_pos += sizeof(int);
+	// number of blocks per dimension: (extent - 1) / block_size + 1, so a partial trailing block counts as one
+	size_t num_x, num_y, num_z;
+	num_x = (r1 - 1) / block_size + 1;
+	num_y = (r2 - 1) / block_size + 1;
+	num_z = (r3 - 1) / block_size + 1;
+
+	size_t max_num_block_elements = block_size * block_size * block_size;
+	size_t num_blocks = num_x * num_y * num_z;
+	// next fields: realPrecision (double) and the interval count (int)
+	double realPrecision = bytesToDouble(comp_data_pos);
+	comp_data_pos += sizeof(double);
+	unsigned int intervals = bytesToInt_bigEndian(comp_data_pos);
+	comp_data_pos += sizeof(int);
+
+	updateQuantizationInfo(intervals); // NOTE(review): presumably sets exe_params->intvRadius (read further down) — confirm
+	// Huffman tree for the element type codes: tree_size, then an int node count followed by tree_size bytes of tree data
+	unsigned int tree_size = bytesToInt_bigEndian(comp_data_pos);
+	comp_data_pos += sizeof(int);
+	
+	int stateNum = 2*intervals;
+	HuffmanTree* huffmanTree = createHuffmanTree(stateNum);	
+	
+	int nodeCount = bytesToInt_bigEndian(comp_data_pos);
+	node root = reconstruct_HuffTree_from_bytes_anyStates(huffmanTree,comp_data_pos+sizeof(int), nodeCount);
+	comp_data_pos += sizeof(int) + tree_size;
+	// one byte flag (use_mean) and one float (mean); both are always read, whatever the flag value
+	float mean;
+	unsigned char use_mean;
+	memcpy(&use_mean, comp_data_pos, sizeof(unsigned char));
+	comp_data_pos += sizeof(unsigned char);
+	memcpy(&mean, comp_data_pos, sizeof(float));
+	comp_data_pos += sizeof(float);
+	size_t reg_count = 0;
+	// one flag per block, packed eight per byte in the stream; a flag of 0 selects regression for that block
+	unsigned char * indicator;
+	size_t indicator_bitlength = (num_blocks - 1)/8 + 1;
+	convertByteArray2IntArray_fast_1b(num_blocks, comp_data_pos, indicator_bitlength, &indicator);
+	comp_data_pos += indicator_bitlength;
+	for(size_t i=0; i<num_blocks; i++){
+		if(!indicator[i]) reg_count ++;
+	}
+	// side information for the 4 regression coefficients; only present in the stream when reg_count > 0
+	int coeff_intvRadius[4];
+	int * coeff_result_type = (int *) malloc(num_blocks*4*sizeof(int));
+	int * coeff_type[4];
+	double precision[4];
+	float * coeff_unpred_data[4];
+	if(reg_count > 0){
+		for(int i=0; i<4; i++){
+			precision[i] = bytesToDouble(comp_data_pos);
+			comp_data_pos += sizeof(double);
+			coeff_intvRadius[i] = bytesToInt_bigEndian(comp_data_pos);
+			comp_data_pos += sizeof(int);
+			unsigned int tree_size = bytesToInt_bigEndian(comp_data_pos); // shadows the outer tree_size
+			comp_data_pos += sizeof(int);
+			int stateNum = 2*coeff_intvRadius[i]*2;
+			HuffmanTree* huffmanTree = createHuffmanTree(stateNum);	
+			int nodeCount = bytesToInt_bigEndian(comp_data_pos);
+			node root = reconstruct_HuffTree_from_bytes_anyStates(huffmanTree, comp_data_pos+sizeof(int), nodeCount); // shadows the outer root, which is still needed for the element codes
+			comp_data_pos += sizeof(int) + tree_size;
+
+			coeff_type[i] = coeff_result_type + i * num_blocks; // slice i of the shared coefficient-code buffer
+			size_t typeArray_size = bytesToSize(comp_data_pos);
+			decode(comp_data_pos + sizeof(size_t), reg_count, root, coeff_type[i]); // one code per regression block
+			comp_data_pos += sizeof(size_t) + typeArray_size;
+			int coeff_unpred_count = bytesToInt_bigEndian(comp_data_pos);
+			comp_data_pos += sizeof(int);
+			coeff_unpred_data[i] = (float *) comp_data_pos; // points into comp_data (no copy); NOTE(review): nothing visible here guarantees float alignment
+			comp_data_pos += coeff_unpred_count * sizeof(float);
+			SZ_ReleaseHuffman(huffmanTree);
+		}
+	}
+	float last_coefficients[4] = {0.0}; // running coefficients, updated by every regression block in decode order
+	int coeff_unpred_data_count[4] = {0};
+	int coeff_index = 0;
+	updateQuantizationInfo(intervals);
+	// raw ("unpredictable") element values: a size_t count followed by that many floats, consumed in block order
+	size_t total_unpred;
+	memcpy(&total_unpred, comp_data_pos, sizeof(size_t));
+	comp_data_pos += sizeof(size_t);
+	float * unpred_data = (float *) comp_data_pos;
+	comp_data_pos += total_unpred * sizeof(float);
+	// last field read: Huffman-coded type codes, block_size^3 per block for all num_blocks blocks
+	int * result_type = (int *) malloc(num_blocks*max_num_block_elements * sizeof(int));
+	decode(comp_data_pos, num_blocks*max_num_block_elements, root, result_type);
+	SZ_ReleaseHuffman(huffmanTree);
+	
+	int intvRadius = exe_params->intvRadius;
+	
+	int * type;
+	float * data_pos = *data;
+	size_t cur_unpred_count;
+	unsigned char * indicator_pos = indicator;
+	int dec_buffer_size = block_size + 1; // scratch cube edge: one extra layer in front of the block in every dimension
+	float * dec_buffer = (float *) malloc(dec_buffer_size*dec_buffer_size*dec_buffer_size*sizeof(float));
+	memset(dec_buffer, 0, dec_buffer_size*dec_buffer_size*dec_buffer_size*sizeof(float)); // the index-0 planes are never written below, so they stay 0
+	float * block_data_pos_x = NULL;
+	float * block_data_pos_y = NULL;
+	float * block_data_pos_z = NULL;
+	int block_dim0_offset = dec_buffer_size*dec_buffer_size;
+	int block_dim1_offset = dec_buffer_size;
+	if(use_mean){ // same block loop as the else branch, plus type code 1 meaning "store mean"
+		type = result_type;
+		for(size_t i=0; i<num_x; i++){
+			for(size_t j=0; j<num_y; j++){
+				for(size_t k=0; k<num_z; k++){
+					data_pos = dec_buffer + dec_buffer_size*dec_buffer_size + dec_buffer_size + 1; // scratch element (1,1,1), so the -1 neighbour offsets used below stay inside dec_buffer
+					if(*indicator_pos){
+						// decompress by SZ
+						// each element is rebuilt from up to 7 already-decoded neighbours in the scratch cube (or the zero border), never from other blocks in *data
+						float * block_data_pos;
+						float pred;
+						size_t index = 0;
+						int type_;
+						size_t unpredictable_count = 0;
+						for(size_t ii=0; ii<block_size; ii++){
+							for(size_t jj=0; jj<block_size; jj++){
+								for(size_t kk=0; kk<block_size; kk++){
+									block_data_pos = data_pos + ii*block_dim0_offset + jj*block_dim1_offset + kk;
+									type_ = type[index];
+									if(type_ == 1){ // code 1: the element is set to the stored mean
+										*block_data_pos = mean;
+									}
+									else if(type_ == 0){ // code 0: take the next raw float
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{ // any other code: neighbour prediction plus (type_ - intvRadius) steps of 2*realPrecision
+										pred = block_data_pos[-1] + block_data_pos[-block_dim1_offset]+ block_data_pos[-block_dim0_offset] - block_data_pos[-block_dim1_offset - 1]
+												 - block_data_pos[-block_dim0_offset - 1] - block_data_pos[-block_dim0_offset - block_dim1_offset] + block_data_pos[-block_dim0_offset - block_dim1_offset - 1];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+								}
+							}
+						}
+						cur_unpred_count = unpredictable_count;
+					}
+					else{
+						// decompress by regression
+						{
+							//restore regression coefficients
+							float pred;
+							int type_;
+							for(int e=0; e<4; e++){
+								// coefficient e of this block:
+								// a non-zero code moves the running value by (type_ - coeff_intvRadius[e]) steps of 2*precision[e];
+								// a zero code replaces it with the next raw float from coeff_unpred_data[e]
+								type_ = coeff_type[e][coeff_index];
+								if (type_ != 0){
+									pred = last_coefficients[e];
+									last_coefficients[e] = pred + 2 * (type_ - coeff_intvRadius[e]) * precision[e];
+								}
+								else{
+									last_coefficients[e] = coeff_unpred_data[e][coeff_unpred_data_count[e]];
+									coeff_unpred_data_count[e] ++;
+								}
+							}
+							coeff_index ++;
+						}
+						{
+							float pred;
+							int type_;
+							size_t index = 0;
+							size_t unpredictable_count = 0;
+							for(size_t ii=0; ii<block_size; ii++){
+								for(size_t jj=0; jj<block_size; jj++){
+									for(size_t kk=0; kk<block_size; kk++){
+										type_ = type[index];
+										if (type_ != 0){
+											pred = last_coefficients[0] * ii + last_coefficients[1] * jj + last_coefficients[2] * kk + last_coefficients[3];
+											data_pos[ii*block_dim0_offset + jj*block_dim1_offset + kk] = pred + 2 * (type_ - intvRadius) * realPrecision;
+										}
+										else{
+											data_pos[ii*block_dim0_offset + jj*block_dim1_offset + kk] = unpred_data[unpredictable_count ++];
+										}
+										index ++;	
+									}
+								}
+							}
+							cur_unpred_count = unpredictable_count;
+						}
+					}
+					indicator_pos ++;
+					unpred_data += cur_unpred_count;
+					// cursors advance per block: one indicator flag, the raw floats this block consumed,
+					// and (next statement) a full block_size^3 run of type codes;
+					// blocks that overhang the array edge are decoded full-size too and clipped in the copy below
+					type += block_size * block_size * block_size;
+
+					// copy the decoded block from the scratch cube into *data, stopping at the r1/r2/r3 bounds
+					block_data_pos_x = *data + i*block_size * dim0_offset + j*block_size * dim1_offset + k*block_size;
+					for(int ii=0; ii<block_size; ii++){ // NOTE(review): int index compared against size_t block_size
+						if(i*block_size + ii >= r1) break;
+						block_data_pos_y = block_data_pos_x;
+						for(int jj=0; jj<block_size; jj++){
+							if(j*block_size + jj >= r2) break;
+							block_data_pos_z = block_data_pos_y;
+							for(int kk=0; kk<block_size; kk++){
+								if(k*block_size + kk >= r3) break;
+								*block_data_pos_z = data_pos[ii*dec_buffer_size*dec_buffer_size + jj*dec_buffer_size + kk];
+								block_data_pos_z ++;
+							}
+							block_data_pos_y += dim1_offset;
+						}
+						block_data_pos_x += dim0_offset;
+					}
+
+				}
+			}
+		}
+
+	}
+	else{ // no mean: identical block loop, but type code 1 gets no special treatment
+		type = result_type;
+		for(size_t i=0; i<num_x; i++){
+			for(size_t j=0; j<num_y; j++){
+				for(size_t k=0; k<num_z; k++){
+					data_pos = dec_buffer + dec_buffer_size*dec_buffer_size + dec_buffer_size + 1; // scratch element (1,1,1), so the -1 neighbour offsets used below stay inside dec_buffer
+					if(*indicator_pos){
+						// decompress by SZ
+						// each element is rebuilt from up to 7 already-decoded neighbours in the scratch cube (or the zero border), never from other blocks in *data
+						float * block_data_pos;
+						float pred;
+						size_t index = 0;
+						int type_;
+						size_t unpredictable_count = 0;
+						for(size_t ii=0; ii<block_size; ii++){
+							for(size_t jj=0; jj<block_size; jj++){
+								for(size_t kk=0; kk<block_size; kk++){
+									block_data_pos = data_pos + ii*block_dim0_offset + jj*block_dim1_offset + kk;
+									type_ = type[index];
+									if(type_ == 0){ // code 0: take the next raw float
+										*block_data_pos = unpred_data[unpredictable_count ++];
+									}
+									else{ // any other code: neighbour prediction plus (type_ - intvRadius) steps of 2*realPrecision
+										pred = block_data_pos[-1] + block_data_pos[-block_dim1_offset]+ block_data_pos[-block_dim0_offset] - block_data_pos[-block_dim1_offset - 1]
+												 - block_data_pos[-block_dim0_offset - 1] - block_data_pos[-block_dim0_offset - block_dim1_offset] + block_data_pos[-block_dim0_offset - block_dim1_offset - 1];
+										*block_data_pos = pred + 2 * (type_ - intvRadius) * realPrecision;
+									}
+									index ++;
+								}
+							}
+						}
+						cur_unpred_count = unpredictable_count;
+					}
+					else{
+						// decompress by regression
+						{
+							//restore regression coefficients
+							float pred;
+							int type_;
+							for(int e=0; e<4; e++){
+								// coefficient e of this block:
+								// a non-zero code moves the running value by (type_ - coeff_intvRadius[e]) steps of 2*precision[e];
+								// a zero code replaces it with the next raw float from coeff_unpred_data[e]
+								type_ = coeff_type[e][coeff_index];
+								if (type_ != 0){
+									pred = last_coefficients[e];
+									last_coefficients[e] = pred + 2 * (type_ - coeff_intvRadius[e]) * precision[e];
+								}
+								else{
+									last_coefficients[e] = coeff_unpred_data[e][coeff_unpred_data_count[e]];
+									coeff_unpred_data_count[e] ++;
+								}
+							}
+							coeff_index ++;
+						}
+						{
+							float pred;
+							int type_;
+							size_t index = 0;
+							size_t unpredictable_count = 0;
+							for(size_t ii=0; ii<block_size; ii++){
+								for(size_t jj=0; jj<block_size; jj++){
+									for(size_t kk=0; kk<block_size; kk++){
+										type_ = type[index];
+										if (type_ != 0){
+											pred = last_coefficients[0] * ii + last_coefficients[1] * jj + last_coefficients[2] * kk + last_coefficients[3];
+											data_pos[ii*block_dim0_offset + jj*block_dim1_offset + kk] = pred + 2 * (type_ - intvRadius) * realPrecision;
+										}
+										else{
+											data_pos[ii*block_dim0_offset + jj*block_dim1_offset + kk] = unpred_data[unpredictable_count ++];
+										}
+										index ++;	
+									}
+								}
+							}
+							cur_unpred_count = unpredictable_count;
+						}
+					}
+					indicator_pos ++;
+					unpred_data += cur_unpred_count;
+					// cursors advance per block: one indicator flag, the raw floats this block consumed,
+					// and (next statement) a full block_size^3 run of type codes;
+					// blocks that overhang the array edge are decoded full-size too and clipped in the copy below
+					type += block_size * block_size * block_size;
+					// copy the decoded block from the scratch cube into *data, stopping at the r1/r2/r3 bounds
+					block_data_pos_x = *data + i*block_size * dim0_offset + j*block_size * dim1_offset + k*block_size;
+					for(int ii=0; ii<block_size; ii++){ // NOTE(review): int index compared against size_t block_size
+						if(i*block_size + ii >= r1) break;
+						block_data_pos_y = block_data_pos_x;
+						for(int jj=0; jj<block_size; jj++){
+							if(j*block_size + jj >= r2) break;
+							block_data_pos_z = block_data_pos_y;
+							for(int kk=0; kk<block_size; kk++){
+								if(k*block_size + kk >= r3) break;
+								*block_data_pos_z = data_pos[ii*dec_buffer_size*dec_buffer_size + jj*dec_buffer_size + kk];
+								block_data_pos_z ++;
+							}
+							block_data_pos_y += dim1_offset;
+						}
+						block_data_pos_x += dim0_offset;
+					}
+				}
+			}
+		}
+	}
+	free(dec_buffer);
+	free(coeff_result_type);
+
+	free(indicator);
+	free(result_type);
+}
diff --git a/thirdparty/SZ/sz/src/szd_float_pwr.c b/thirdparty/SZ/sz/src/szd_float_pwr.c
index 4ab18341f0fe136627920e0dd8c6264dff802a14..b761d4b99061534728e0a1a61efcfae0608ae4a5 100644
--- a/thirdparty/SZ/sz/src/szd_float_pwr.c
+++ b/thirdparty/SZ/sz/src/szd_float_pwr.c
@@ -16,6 +16,7 @@
 #include "sz.h"
 #include "Huffman.h"
 #include "sz_float_pwr.h"
+#include "utility.h"
 //#include "rw.h"
 //
 #pragma GCC diagnostic push
@@ -1349,4 +1350,74 @@ void decompressDataSeries_float_1D_pwrgroup(float** data, size_t dataSeriesLengt
 	free(groupErrorBounds);
 	free(groupID);
 }
+
+void decompressDataSeries_float_1D_pwr_pre_log(float** data, size_t dataSeriesLength, TightDataPointStorageF* tdps) {
+	// Decodes the 1D series with decompressDataSeries_float_1D, then rewrites every element in place:
+	// values below tdps->minLogValue become 0, all others become exp2(value).
+	decompressDataSeries_float_1D(data, dataSeriesLength, tdps);
+	const float minLog = tdps->minLogValue;
+	const int hasSigns = tdps->pwrErrBoundBytes_size > 0;
+	unsigned char *signFlags = NULL;
+	// A non-empty pwrErrBoundBytes payload is losslessly unpacked into a flag array indexed per element.
+	if(hasSigns)
+		sz_lossless_decompress(confparams_dec->losslessCompressor, tdps->pwrErrBoundBytes, tdps->pwrErrBoundBytes_size, &signFlags, dataSeriesLength);
+
+	float *values = *data;
+	for(size_t pos = 0; pos < dataSeriesLength; pos++){
+		// Threshold first, exponentiate otherwise.
+		float restored = (values[pos] < minLog) ? 0 : exp2(values[pos]);
+		// A non-zero flag means the element is negated after the mapping.
+		if(hasSigns && signFlags[pos])
+			restored = -restored;
+		values[pos] = restored;
+	}
+	free(signFlags); // still NULL (a no-op) when no sign payload was unpacked
+}
+
+void decompressDataSeries_float_2D_pwr_pre_log(float** data, size_t r1, size_t r2, TightDataPointStorageF* tdps) {
+	// Decodes the r1 x r2 series with decompressDataSeries_float_2D, then rewrites every element in place:
+	// values below tdps->minLogValue become 0, all others become exp2(value).
+	size_t dataSeriesLength = r1 * r2;
+	decompressDataSeries_float_2D(data, r1, r2, tdps);
+	const float minLog = tdps->minLogValue;
+	const int hasSigns = tdps->pwrErrBoundBytes_size > 0;
+	unsigned char *signFlags = NULL;
+	// A non-empty pwrErrBoundBytes payload is losslessly unpacked into a flag array indexed per element.
+	if(hasSigns)
+		sz_lossless_decompress(confparams_dec->losslessCompressor, tdps->pwrErrBoundBytes, tdps->pwrErrBoundBytes_size, &signFlags, dataSeriesLength);
+
+	float *values = *data;
+	for(size_t pos = 0; pos < dataSeriesLength; pos++){
+		// Threshold first, exponentiate otherwise.
+		float restored = (values[pos] < minLog) ? 0 : exp2(values[pos]);
+		// A non-zero flag means the element is negated after the mapping.
+		if(hasSigns && signFlags[pos])
+			restored = -restored;
+		values[pos] = restored;
+	}
+	free(signFlags); // still NULL (a no-op) when no sign payload was unpacked
+}
+
+void decompressDataSeries_float_3D_pwr_pre_log(float** data, size_t r1, size_t r2, size_t r3, TightDataPointStorageF* tdps) {
+	// Decodes the r1 x r2 x r3 series with decompressDataSeries_float_3D, then rewrites every element in place:
+	// values below tdps->minLogValue become 0, all others become exp2(value).
+	size_t dataSeriesLength = r1 * r2 * r3;
+	decompressDataSeries_float_3D(data, r1, r2, r3, tdps);
+	const float minLog = tdps->minLogValue;
+	const int hasSigns = tdps->pwrErrBoundBytes_size > 0;
+	unsigned char *signFlags = NULL;
+	// A non-empty pwrErrBoundBytes payload is losslessly unpacked into a flag array indexed per element.
+	if(hasSigns)
+		sz_lossless_decompress(confparams_dec->losslessCompressor, tdps->pwrErrBoundBytes, tdps->pwrErrBoundBytes_size, &signFlags, dataSeriesLength);
+
+	float *values = *data;
+	for(size_t pos = 0; pos < dataSeriesLength; pos++){
+		// Threshold first, exponentiate otherwise; a non-zero flag then negates the element.
+		float restored = (values[pos] < minLog) ? 0 : exp2(values[pos]);
+		if(hasSigns && signFlags[pos])
+			restored = -restored;
+		values[pos] = restored;
+	}
+	free(signFlags); // still NULL (a no-op) when no sign payload was unpacked
+}
 #pragma GCC diagnostic pop
diff --git a/thirdparty/SZ/sz/src/szd_int16.c b/thirdparty/SZ/sz/src/szd_int16.c
index 3c402dca944a658172e6c0db502961fb9c2a1721..1198e05fd2c6a48b526e327c7cfe56e4c069b7de 100644
--- a/thirdparty/SZ/sz/src/szd_int16.c
+++ b/thirdparty/SZ/sz/src/szd_int16.c
@@ -15,6 +15,7 @@
 #include "sz.h"
 #include "szd_int16.h"
 #include "Huffman.h"
+#include "utility.h"
 
 /**
  * 
@@ -32,10 +33,10 @@ int SZ_decompress_args_int16(int16_t** newData, size_t r5, size_t r4, size_t r3,
 	size_t i, tmpSize = 3+MetaDataByteLength+1+sizeof(int16_t)+exe_params->SZ_SIZE_TYPE;
 	unsigned char* szTmpBytes;	
 		
-		if(cmpSize!=4+2+4+MetaDataByteLength && cmpSize!=4+2+8+MetaDataByteLength)
+	if(cmpSize!=4+2+4+MetaDataByteLength && cmpSize!=4+2+8+MetaDataByteLength)
 	{
-		int isZlib = isZlibFormat(cmpBytes[0], cmpBytes[1]);
-		if(isZlib)
+		confparams_dec->losslessCompressor = is_lossless_compressed_data(cmpBytes, cmpSize);
+		if(confparams_dec->losslessCompressor!=-1)
 			confparams_dec->szMode = SZ_BEST_COMPRESSION;
 		else
 			confparams_dec->szMode = SZ_BEST_SPEED;		
@@ -48,7 +49,7 @@ int SZ_decompress_args_int16(int16_t** newData, size_t r5, size_t r4, size_t r3,
 		{
 			if(targetUncompressSize<MIN_ZLIB_DEC_ALLOMEM_BYTES) //Considering the minimum size
 				targetUncompressSize = MIN_ZLIB_DEC_ALLOMEM_BYTES; 
-			tmpSize = zlib_uncompress5(cmpBytes, (unsigned long)cmpSize, &szTmpBytes, (unsigned long)targetUncompressSize+4+MetaDataByteLength+exe_params->SZ_SIZE_TYPE);//		(unsigned long)targetUncompressSize+8: consider the total length under lossless compression mode is actually 3+4+1+targetUncompressSize
+			tmpSize = sz_lossless_decompress(confparams_dec->losslessCompressor, cmpBytes, (unsigned long)cmpSize, &szTmpBytes, (unsigned long)targetUncompressSize+4+MetaDataByteLength+exe_params->SZ_SIZE_TYPE);//		(unsigned long)targetUncompressSize+8: consider the total length under lossless compression mode is actually 3+4+1+targetUncompressSize
 			//szTmpBytes = (unsigned char*)malloc(sizeof(unsigned char)*tmpSize);
 			//memcpy(szTmpBytes, tmpBytes, tmpSize);
 			//free(tmpBytes); //release useless memory		
diff --git a/thirdparty/SZ/sz/src/szd_int32.c b/thirdparty/SZ/sz/src/szd_int32.c
index 43dc74e170671d4a873a849f97ae4a38dce06aa1..b5f31b09aba44de0a1cc1687cb07bd405f2136b1 100644
--- a/thirdparty/SZ/sz/src/szd_int32.c
+++ b/thirdparty/SZ/sz/src/szd_int32.c
@@ -15,6 +15,7 @@
 #include "sz.h"
 #include "szd_int32.h"
 #include "Huffman.h"
+#include "utility.h"
 
 /**
  * 
@@ -34,8 +35,8 @@ int SZ_decompress_args_int32(int32_t** newData, size_t r5, size_t r4, size_t r3,
 		
 	if(cmpSize!=4+4+4+MetaDataByteLength && cmpSize!=4+4+8+MetaDataByteLength)
 	{
-		int isZlib = isZlibFormat(cmpBytes[0], cmpBytes[1]);
-		if(isZlib)
+		confparams_dec->losslessCompressor = is_lossless_compressed_data(cmpBytes, cmpSize);
+		if(confparams_dec->losslessCompressor!=-1)
 			confparams_dec->szMode = SZ_BEST_COMPRESSION;
 		else
 			confparams_dec->szMode = SZ_BEST_SPEED;		
@@ -48,7 +49,7 @@ int SZ_decompress_args_int32(int32_t** newData, size_t r5, size_t r4, size_t r3,
 		{
 			if(targetUncompressSize<MIN_ZLIB_DEC_ALLOMEM_BYTES) //Considering the minimum size
 				targetUncompressSize = MIN_ZLIB_DEC_ALLOMEM_BYTES; 
-			tmpSize = zlib_uncompress5(cmpBytes, (unsigned long)cmpSize, &szTmpBytes, (unsigned long)targetUncompressSize+4+MetaDataByteLength+exe_params->SZ_SIZE_TYPE);//		(unsigned long)targetUncompressSize+8: consider the total length under lossless compression mode is actually 3+4+1+targetUncompressSize
+			tmpSize = sz_lossless_decompress(confparams_dec->losslessCompressor, cmpBytes, (unsigned long)cmpSize, &szTmpBytes, (unsigned long)targetUncompressSize+4+MetaDataByteLength+exe_params->SZ_SIZE_TYPE);//		(unsigned long)targetUncompressSize+8: consider the total length under lossless compression mode is actually 3+4+1+targetUncompressSize
 			//szTmpBytes = (unsigned char*)malloc(sizeof(unsigned char)*tmpSize);
 			//memcpy(szTmpBytes, tmpBytes, tmpSize);
 			//free(tmpBytes); //release useless memory		
diff --git a/thirdparty/SZ/sz/src/szd_int64.c b/thirdparty/SZ/sz/src/szd_int64.c
index aaa4a533fc195c4f2d52174cf7de5a333629a700..07a054f54a196f31fe6e9b3ab1eafc532cdad4cf 100644
--- a/thirdparty/SZ/sz/src/szd_int64.c
+++ b/thirdparty/SZ/sz/src/szd_int64.c
@@ -15,6 +15,7 @@
 #include "sz.h"
 #include "szd_int64.h"
 #include "Huffman.h"
+#include "utility.h"
 
 /**
  * 
@@ -34,8 +35,8 @@ int SZ_decompress_args_int64(int64_t** newData, size_t r5, size_t r4, size_t r3,
 		
 	if(cmpSize!=4+8+4+MetaDataByteLength && cmpSize!=4+8+8+MetaDataByteLength)
 	{
-		int isZlib = isZlibFormat(cmpBytes[0], cmpBytes[1]);
-		if(isZlib)
+		confparams_dec->losslessCompressor = is_lossless_compressed_data(cmpBytes, cmpSize);
+		if(confparams_dec->losslessCompressor!=-1)
 			confparams_dec->szMode = SZ_BEST_COMPRESSION;
 		else
 			confparams_dec->szMode = SZ_BEST_SPEED;		
@@ -48,7 +49,7 @@ int SZ_decompress_args_int64(int64_t** newData, size_t r5, size_t r4, size_t r3,
 		{
 			if(targetUncompressSize<MIN_ZLIB_DEC_ALLOMEM_BYTES) //Considering the minimum size
 				targetUncompressSize = MIN_ZLIB_DEC_ALLOMEM_BYTES; 
-			tmpSize = zlib_uncompress5(cmpBytes, (unsigned long)cmpSize, &szTmpBytes, (unsigned long)targetUncompressSize+4+MetaDataByteLength+exe_params->SZ_SIZE_TYPE);//		(unsigned long)targetUncompressSize+8: consider the total length under lossless compression mode is actually 3+4+1+targetUncompressSize
+			tmpSize = sz_lossless_decompress(confparams_dec->losslessCompressor, cmpBytes, (unsigned long)cmpSize, &szTmpBytes, (unsigned long)targetUncompressSize+4+MetaDataByteLength+exe_params->SZ_SIZE_TYPE);//		(unsigned long)targetUncompressSize+8: consider the total length under lossless compression mode is actually 3+4+1+targetUncompressSize
 			//szTmpBytes = (unsigned char*)malloc(sizeof(unsigned char)*tmpSize);
 			//memcpy(szTmpBytes, tmpBytes, tmpSize);
 			//free(tmpBytes); //release useless memory		
diff --git a/thirdparty/SZ/sz/src/szd_int8.c b/thirdparty/SZ/sz/src/szd_int8.c
index 758e91733d8b9f0b34fef94e2262748c278c49d2..850b4595b7501e8651c43efa3b48d77b5f4f12eb 100644
--- a/thirdparty/SZ/sz/src/szd_int8.c
+++ b/thirdparty/SZ/sz/src/szd_int8.c
@@ -15,6 +15,7 @@
 #include "sz.h"
 #include "szd_int8.h"
 #include "Huffman.h"
+#include "utility.h"
 
 /**
  * 
@@ -34,8 +35,8 @@ int SZ_decompress_args_int8(int8_t** newData, size_t r5, size_t r4, size_t r3, s
 		
 	if(cmpSize!=4+1+4+MetaDataByteLength && cmpSize!=4+1+8+MetaDataByteLength)
 	{
-		int isZlib = isZlibFormat(cmpBytes[0], cmpBytes[1]);
-		if(isZlib)
+		confparams_dec->losslessCompressor = is_lossless_compressed_data(cmpBytes, cmpSize);
+		if(confparams_dec->losslessCompressor!=-1)
 			confparams_dec->szMode = SZ_BEST_COMPRESSION;
 		else
 			confparams_dec->szMode = SZ_BEST_SPEED;		
@@ -48,7 +49,7 @@ int SZ_decompress_args_int8(int8_t** newData, size_t r5, size_t r4, size_t r3, s
 		{
 			if(targetUncompressSize<MIN_ZLIB_DEC_ALLOMEM_BYTES) //Considering the minimum size
 				targetUncompressSize = MIN_ZLIB_DEC_ALLOMEM_BYTES; 
-			tmpSize = zlib_uncompress5(cmpBytes, (unsigned long)cmpSize, &szTmpBytes, (unsigned long)targetUncompressSize+4+MetaDataByteLength+exe_params->SZ_SIZE_TYPE);//		(unsigned long)targetUncompressSize+8: consider the total length under lossless compression mode is actually 3+4+1+targetUncompressSize
+			tmpSize = sz_lossless_decompress(confparams_dec->losslessCompressor, cmpBytes, (unsigned long)cmpSize, &szTmpBytes, (unsigned long)targetUncompressSize+4+MetaDataByteLength+exe_params->SZ_SIZE_TYPE);//		(unsigned long)targetUncompressSize+8: consider the total length under lossless compression mode is actually 3+4+1+targetUncompressSize
 			//szTmpBytes = (unsigned char*)malloc(sizeof(unsigned char)*tmpSize);
 			//memcpy(szTmpBytes, tmpBytes, tmpSize);
 			//free(tmpBytes); //release useless memory		
diff --git a/thirdparty/SZ/sz/src/szd_uint16.c b/thirdparty/SZ/sz/src/szd_uint16.c
index bdc746990aa25d1566bc5be27b949e38da305b16..551eecfa326ae73c2b42ce34d8ff51fd9774f12a 100644
--- a/thirdparty/SZ/sz/src/szd_uint16.c
+++ b/thirdparty/SZ/sz/src/szd_uint16.c
@@ -15,6 +15,7 @@
 #include "sz.h"
 #include "szd_uint16.h"
 #include "Huffman.h"
+#include "utility.h"
 
 /**
  * 
@@ -34,8 +35,8 @@ int SZ_decompress_args_uint16(uint16_t** newData, size_t r5, size_t r4, size_t r
 		
 	if(cmpSize!=4+2+4+MetaDataByteLength && cmpSize!=4+2+8+MetaDataByteLength)
 	{
-		int isZlib = isZlibFormat(cmpBytes[0], cmpBytes[1]);
-		if(isZlib)
+		confparams_dec->losslessCompressor = is_lossless_compressed_data(cmpBytes, cmpSize);
+		if(confparams_dec->losslessCompressor!=-1)
 			confparams_dec->szMode = SZ_BEST_COMPRESSION;
 		else
 			confparams_dec->szMode = SZ_BEST_SPEED;		
@@ -48,7 +49,7 @@ int SZ_decompress_args_uint16(uint16_t** newData, size_t r5, size_t r4, size_t r
 		{
 			if(targetUncompressSize<MIN_ZLIB_DEC_ALLOMEM_BYTES) //Considering the minimum size
 				targetUncompressSize = MIN_ZLIB_DEC_ALLOMEM_BYTES; 
-			tmpSize = zlib_uncompress5(cmpBytes, (unsigned long)cmpSize, &szTmpBytes, (unsigned long)targetUncompressSize+4+MetaDataByteLength+exe_params->SZ_SIZE_TYPE);//		(unsigned long)targetUncompressSize+8: consider the total length under lossless compression mode is actually 3+4+1+targetUncompressSize
+			tmpSize = sz_lossless_decompress(confparams_dec->losslessCompressor, cmpBytes, (unsigned long)cmpSize, &szTmpBytes, (unsigned long)targetUncompressSize+4+MetaDataByteLength+exe_params->SZ_SIZE_TYPE);//		(unsigned long)targetUncompressSize+8: consider the total length under lossless compression mode is actually 3+4+1+targetUncompressSize
 			//szTmpBytes = (unsigned char*)malloc(sizeof(unsigned char)*tmpSize);
 			//memcpy(szTmpBytes, tmpBytes, tmpSize);
 			//free(tmpBytes); //release useless memory		
diff --git a/thirdparty/SZ/sz/src/szd_uint32.c b/thirdparty/SZ/sz/src/szd_uint32.c
index 795eabe6248b4ac4460008a0aa9ae511a514486d..04e8049f9dc9f3a8cf6ba01aa0fc4bb691b4d735 100644
--- a/thirdparty/SZ/sz/src/szd_uint32.c
+++ b/thirdparty/SZ/sz/src/szd_uint32.c
@@ -15,6 +15,7 @@
 #include "sz.h"
 #include "szd_uint32.h"
 #include "Huffman.h"
+#include "utility.h"
 
 /**
  * 
@@ -34,8 +35,8 @@ int SZ_decompress_args_uint32(uint32_t** newData, size_t r5, size_t r4, size_t r
 		
 	if(cmpSize!=4+4+4+MetaDataByteLength && cmpSize!=4+4+8+MetaDataByteLength)
 	{
-		int isZlib = isZlibFormat(cmpBytes[0], cmpBytes[1]);
-		if(isZlib)
+		confparams_dec->losslessCompressor = is_lossless_compressed_data(cmpBytes, cmpSize);
+		if(confparams_dec->losslessCompressor!=-1)
 			confparams_dec->szMode = SZ_BEST_COMPRESSION;
 		else
 			confparams_dec->szMode = SZ_BEST_SPEED;		
@@ -48,7 +49,7 @@ int SZ_decompress_args_uint32(uint32_t** newData, size_t r5, size_t r4, size_t r
 		{
 			if(targetUncompressSize<MIN_ZLIB_DEC_ALLOMEM_BYTES) //Considering the minimum size
 				targetUncompressSize = MIN_ZLIB_DEC_ALLOMEM_BYTES; 
-			tmpSize = zlib_uncompress5(cmpBytes, (unsigned long)cmpSize, &szTmpBytes, (unsigned long)targetUncompressSize+4+MetaDataByteLength+exe_params->SZ_SIZE_TYPE);//		(unsigned long)targetUncompressSize+8: consider the total length under lossless compression mode is actually 3+4+1+targetUncompressSize
+			tmpSize = sz_lossless_decompress(confparams_dec->losslessCompressor, cmpBytes, (unsigned long)cmpSize, &szTmpBytes, (unsigned long)targetUncompressSize+4+MetaDataByteLength+exe_params->SZ_SIZE_TYPE);//		(unsigned long)targetUncompressSize+8: consider the total length under lossless compression mode is actually 3+4+1+targetUncompressSize
 			//szTmpBytes = (unsigned char*)malloc(sizeof(unsigned char)*tmpSize);
 			//memcpy(szTmpBytes, tmpBytes, tmpSize);
 			//free(tmpBytes); //release useless memory		
diff --git a/thirdparty/SZ/sz/src/szd_uint64.c b/thirdparty/SZ/sz/src/szd_uint64.c
index df2b8383ea60dbccc1e868ed39ce0f46021546fe..84d57168c4f7eed0bd49bf60cf8b8a3d19271b27 100644
--- a/thirdparty/SZ/sz/src/szd_uint64.c
+++ b/thirdparty/SZ/sz/src/szd_uint64.c
@@ -15,6 +15,7 @@
 #include "sz.h"
 #include "szd_uint64.h"
 #include "Huffman.h"
+#include "utility.h"
 
 /**
  * 
@@ -34,8 +35,8 @@ int SZ_decompress_args_uint64(uint64_t** newData, size_t r5, size_t r4, size_t r
 		
 	if(cmpSize!=4+8+4+MetaDataByteLength && cmpSize!=4+8+8+MetaDataByteLength)
 	{
-		int isZlib = isZlibFormat(cmpBytes[0], cmpBytes[1]);
-		if(isZlib)
+		confparams_dec->losslessCompressor = is_lossless_compressed_data(cmpBytes, cmpSize);
+		if(confparams_dec->losslessCompressor!=-1)
 			confparams_dec->szMode = SZ_BEST_COMPRESSION;
 		else
 			confparams_dec->szMode = SZ_BEST_SPEED;		
@@ -48,7 +49,7 @@ int SZ_decompress_args_uint64(uint64_t** newData, size_t r5, size_t r4, size_t r
 		{
 			if(targetUncompressSize<MIN_ZLIB_DEC_ALLOMEM_BYTES) //Considering the minimum size
 				targetUncompressSize = MIN_ZLIB_DEC_ALLOMEM_BYTES; 
-			tmpSize = zlib_uncompress5(cmpBytes, (unsigned long)cmpSize, &szTmpBytes, (unsigned long)targetUncompressSize+4+MetaDataByteLength+exe_params->SZ_SIZE_TYPE);//		(unsigned long)targetUncompressSize+8: consider the total length under lossless compression mode is actually 3+4+1+targetUncompressSize
+			tmpSize = sz_lossless_decompress(confparams_dec->losslessCompressor, cmpBytes, (unsigned long)cmpSize, &szTmpBytes, (unsigned long)targetUncompressSize+4+MetaDataByteLength+exe_params->SZ_SIZE_TYPE);//		(unsigned long)targetUncompressSize+8: consider the total length under lossless compression mode is actually 3+4+1+targetUncompressSize
 			//szTmpBytes = (unsigned char*)malloc(sizeof(unsigned char)*tmpSize);
 			//memcpy(szTmpBytes, tmpBytes, tmpSize);
 			//free(tmpBytes); //release useless memory		
diff --git a/thirdparty/SZ/sz/src/szd_uint8.c b/thirdparty/SZ/sz/src/szd_uint8.c
index cd616352bc950d4723b16357f7fc9e5dec820fe8..8b992bc2d4d9400325936648479e2cf31151c5f4 100644
--- a/thirdparty/SZ/sz/src/szd_uint8.c
+++ b/thirdparty/SZ/sz/src/szd_uint8.c
@@ -15,6 +15,7 @@
 #include "sz.h"
 #include "szd_uint8.h"
 #include "Huffman.h"
+#include "utility.h"
 
 /**
  * 
@@ -34,8 +35,8 @@ int SZ_decompress_args_uint8(uint8_t** newData, size_t r5, size_t r4, size_t r3,
 		
 	if(cmpSize!=4+1+4+MetaDataByteLength && cmpSize!=4+1+8+MetaDataByteLength)
 	{
-		int isZlib = isZlibFormat(cmpBytes[0], cmpBytes[1]);
-		if(isZlib)
+		confparams_dec->losslessCompressor = is_lossless_compressed_data(cmpBytes, cmpSize);
+		if(confparams_dec->losslessCompressor!=-1)
 			confparams_dec->szMode = SZ_BEST_COMPRESSION;
 		else
 			confparams_dec->szMode = SZ_BEST_SPEED;		
@@ -48,7 +49,7 @@ int SZ_decompress_args_uint8(uint8_t** newData, size_t r5, size_t r4, size_t r3,
 		{
 			if(targetUncompressSize<MIN_ZLIB_DEC_ALLOMEM_BYTES) //Considering the minimum size
 				targetUncompressSize = MIN_ZLIB_DEC_ALLOMEM_BYTES; 
-			tmpSize = zlib_uncompress5(cmpBytes, (unsigned long)cmpSize, &szTmpBytes, (unsigned long)targetUncompressSize+4+MetaDataByteLength+exe_params->SZ_SIZE_TYPE);//		(unsigned long)targetUncompressSize+8: consider the total length under lossless compression mode is actually 3+4+1+targetUncompressSize
+			tmpSize = sz_lossless_decompress(confparams_dec->losslessCompressor, cmpBytes, (unsigned long)cmpSize, &szTmpBytes, (unsigned long)targetUncompressSize+4+MetaDataByteLength+exe_params->SZ_SIZE_TYPE);//		(unsigned long)targetUncompressSize+8: consider the total length under lossless compression mode is actually 3+4+1+targetUncompressSize
 			//szTmpBytes = (unsigned char*)malloc(sizeof(unsigned char)*tmpSize);
 			//memcpy(szTmpBytes, tmpBytes, tmpSize);
 			//free(tmpBytes); //release useless memory		
diff --git a/thirdparty/SZ/sz/src/szf.c b/thirdparty/SZ/sz/src/szf.c
index e3cca0bc2e26211b422acc5b6588e766fe548f92..43fe0b1db2988d4d597fe7a6471fadbf29e74233 100644
--- a/thirdparty/SZ/sz/src/szf.c
+++ b/thirdparty/SZ/sz/src/szf.c
@@ -176,70 +176,70 @@ void sz_compress_d5_double_rev_(double* data, double *reservedValue, unsigned ch
 
 void sz_compress_d1_float_args_(float* data, unsigned char *bytes, size_t *outSize, int *errBoundMode, float *absErrBound, float *relBoundRatio, size_t *r1)
 {
-	unsigned char *tmp_bytes = SZ_compress_args(SZ_FLOAT, data, outSize, *errBoundMode, *absErrBound, *relBoundRatio, 0.1, 1, 0, 0, 0, 0, *r1);
+	unsigned char *tmp_bytes = SZ_compress_args(SZ_FLOAT, data, outSize, *errBoundMode, *absErrBound, *relBoundRatio, 0.1, 0, 0, 0, 0, *r1);
 	memcpy(bytes, tmp_bytes, *outSize);
 	free(tmp_bytes);
 }
 
 void sz_compress_d2_float_args_(float* data, unsigned char *bytes, size_t *outSize, int *errBoundMode, float *absErrBound, float *relBoundRatio, size_t *r1, size_t *r2)
 {
-	unsigned char *tmp_bytes = SZ_compress_args(SZ_FLOAT, data, outSize, *errBoundMode, *absErrBound, *relBoundRatio, 0.1, 1, 0, 0, 0, *r2, *r1);
+	unsigned char *tmp_bytes = SZ_compress_args(SZ_FLOAT, data, outSize, *errBoundMode, *absErrBound, *relBoundRatio, 0.1, 0, 0, 0, *r2, *r1);
 	memcpy(bytes, tmp_bytes, *outSize);
 	free(tmp_bytes);
 }
 
 void sz_compress_d3_float_args_(float* data, unsigned char *bytes, size_t *outSize, int *errBoundMode, float *absErrBound, float *relBoundRatio, size_t *r1, size_t *r2, size_t *r3)
 {
-	unsigned char *tmp_bytes = SZ_compress_args(SZ_FLOAT, data, outSize, *errBoundMode, *absErrBound, *relBoundRatio, 0.1, 1, 0, 0, *r3, *r2, *r1);
+	unsigned char *tmp_bytes = SZ_compress_args(SZ_FLOAT, data, outSize, *errBoundMode, *absErrBound, *relBoundRatio, 0.1, 0, 0, *r3, *r2, *r1);
 	memcpy(bytes, tmp_bytes, *outSize);
 	free(tmp_bytes);
 }
 
 void sz_compress_d4_float_args_(float* data, unsigned char *bytes, size_t *outSize, int *errBoundMode, float *absErrBound, float *relBoundRatio, size_t *r1, size_t *r2, size_t *r3, size_t *r4)
 {
-	unsigned char *tmp_bytes = SZ_compress_args(SZ_FLOAT, data, outSize, *errBoundMode, *absErrBound, *relBoundRatio, 0.1, 1, 0, *r4, *r3, *r2, *r1);
+	unsigned char *tmp_bytes = SZ_compress_args(SZ_FLOAT, data, outSize, *errBoundMode, *absErrBound, *relBoundRatio, 0.1, 0, *r4, *r3, *r2, *r1);
 	memcpy(bytes, tmp_bytes, *outSize);
 	free(tmp_bytes);
 }
 
 void sz_compress_d5_float_args_(float* data, unsigned char *bytes, size_t *outSize, int *errBoundMode, float *absErrBound, float *relBoundRatio, size_t *r1, size_t *r2, size_t *r3, size_t *r4, size_t *r5)
 {
-	unsigned char *tmp_bytes = SZ_compress_args(SZ_FLOAT, data, outSize, *errBoundMode, *absErrBound, *relBoundRatio, 0.1, 1, *r5, *r4, *r3, *r2, *r1);
+	unsigned char *tmp_bytes = SZ_compress_args(SZ_FLOAT, data, outSize, *errBoundMode, *absErrBound, *relBoundRatio, 0.1, *r5, *r4, *r3, *r2, *r1);
 	memcpy(bytes, tmp_bytes, *outSize);
 	free(tmp_bytes);
 }
 
 void sz_compress_d1_double_args_(double* data, unsigned char *bytes, size_t *outSize, int *errBoundMode, double *absErrBound, double *relBoundRatio, size_t *r1)
 {
-	unsigned char *tmp_bytes = SZ_compress_args(SZ_DOUBLE, data, outSize, *errBoundMode, *absErrBound, *relBoundRatio, 0.1, 1, 0, 0, 0, 0, *r1);
+	unsigned char *tmp_bytes = SZ_compress_args(SZ_DOUBLE, data, outSize, *errBoundMode, *absErrBound, *relBoundRatio, 0.1, 0, 0, 0, 0, *r1);
 	memcpy(bytes, tmp_bytes, *outSize);
 	free(tmp_bytes);
 }
 
 void sz_compress_d2_double_args_(double* data, unsigned char *bytes, size_t *outSize, int *errBoundMode, double *absErrBound, double *relBoundRatio, size_t *r1, size_t *r2)
 {
-	unsigned char *tmp_bytes = SZ_compress_args(SZ_DOUBLE, data, outSize, *errBoundMode, *absErrBound, *relBoundRatio, 0.1, 1, 0, 0, 0, *r2, *r1);
+	unsigned char *tmp_bytes = SZ_compress_args(SZ_DOUBLE, data, outSize, *errBoundMode, *absErrBound, *relBoundRatio, 0.1, 0, 0, 0, *r2, *r1);
 	memcpy(bytes, tmp_bytes, *outSize);
 	free(tmp_bytes);
 }
 
 void sz_compress_d3_double_args_(double* data, unsigned char *bytes, size_t *outSize, int *errBoundMode, double *absErrBound, double *relBoundRatio, size_t *r1, size_t *r2, size_t *r3)
 {
-	unsigned char *tmp_bytes = SZ_compress_args(SZ_DOUBLE, data, outSize, *errBoundMode, *absErrBound, *relBoundRatio, 0.1, 1, 0, 0, *r3, *r2, *r1);
+	unsigned char *tmp_bytes = SZ_compress_args(SZ_DOUBLE, data, outSize, *errBoundMode, *absErrBound, *relBoundRatio, 0.1, 0, 0, *r3, *r2, *r1);
 	memcpy(bytes, tmp_bytes, *outSize);
 	free(tmp_bytes);
 }
 
 void sz_compress_d4_double_args_(double* data, unsigned char *bytes, size_t *outSize, int *errBoundMode, double *absErrBound, double *relBoundRatio, size_t *r1, size_t *r2, size_t *r3, size_t *r4)
 {
-	unsigned char *tmp_bytes = SZ_compress_args(SZ_DOUBLE, data, outSize, *errBoundMode, *absErrBound, *relBoundRatio, 0.1, 1, 0, *r4, *r3, *r2, *r1);
+	unsigned char *tmp_bytes = SZ_compress_args(SZ_DOUBLE, data, outSize, *errBoundMode, *absErrBound, *relBoundRatio, 0.1, 0, *r4, *r3, *r2, *r1);
 	memcpy(bytes, tmp_bytes, *outSize);
 	free(tmp_bytes);
 }
 
 void sz_compress_d5_double_args_(double* data, unsigned char *bytes, size_t *outSize, int *errBoundMode, double *absErrBound, double *relBoundRatio, size_t *r1, size_t *r2, size_t *r3, size_t *r4, size_t *r5)
 {
-	unsigned char *tmp_bytes = SZ_compress_args(SZ_DOUBLE, data, outSize, *errBoundMode, *absErrBound, *relBoundRatio, 0.1, 1, *r5, *r4, *r3, *r2, *r1);
+	unsigned char *tmp_bytes = SZ_compress_args(SZ_DOUBLE, data, outSize, *errBoundMode, *absErrBound, *relBoundRatio, 0.1, *r5, *r4, *r3, *r2, *r1);
 	memcpy(bytes, tmp_bytes, *outSize);
 	free(tmp_bytes);
 }
@@ -411,7 +411,7 @@ void sz_batchaddvar_d1_float_(char* varName, int *len, float* data, int *errBoun
     for(i=0;i<*len;i++)
         s2[i]=varName[i];
     s2[*len]='\0';		
-	SZ_batchAddVar(s2, SZ_FLOAT, data, *errBoundMode, *absErrBound, *relBoundRatio, 0, 0, 0, 0, *r1);
+	SZ_batchAddVar(s2, SZ_FLOAT, data, *errBoundMode, *absErrBound, *relBoundRatio, 0.1, 0, 0, 0, 0, *r1);
 }
 void sz_batchaddvar_d2_float_(char* varName, int *len, float* data, int *errBoundMode, float *absErrBound, float *relBoundRatio, size_t *r1, size_t *r2)
 {
@@ -420,7 +420,7 @@ void sz_batchaddvar_d2_float_(char* varName, int *len, float* data, int *errBoun
     for(i=0;i<*len;i++)
         s2[i]=varName[i];
     s2[*len]='\0';		
-	SZ_batchAddVar(s2, SZ_FLOAT, data, *errBoundMode, *absErrBound, *relBoundRatio, 0, 0, 0, *r2, *r1);
+	SZ_batchAddVar(s2, SZ_FLOAT, data, *errBoundMode, *absErrBound, *relBoundRatio, 0.1, 0, 0, 0, *r2, *r1);
 }
 void sz_batchaddvar_d3_float_(char* varName, int *len, float* data, int *errBoundMode, float *absErrBound, float *relBoundRatio, size_t *r1, size_t *r2, size_t *r3)
 {
@@ -429,7 +429,7 @@ void sz_batchaddvar_d3_float_(char* varName, int *len, float* data, int *errBoun
     for(i=0;i<*len;i++)
         s2[i]=varName[i];
     s2[*len]='\0';		
-	SZ_batchAddVar(s2, SZ_FLOAT, data, *errBoundMode, *absErrBound, *relBoundRatio, 0, 0, *r3, *r2, *r1);
+	SZ_batchAddVar(s2, SZ_FLOAT, data, *errBoundMode, *absErrBound, *relBoundRatio, 0.1, 0, 0, *r3, *r2, *r1);
 }
 void sz_batchaddvar_d4_float_(char* varName, int *len, float* data, int *errBoundMode, float *absErrBound, float *relBoundRatio, size_t *r1, size_t *r2, size_t *r3, size_t *r4)
 {
@@ -438,7 +438,7 @@ void sz_batchaddvar_d4_float_(char* varName, int *len, float* data, int *errBoun
     for(i=0;i<*len;i++)
         s2[i]=varName[i];
     s2[*len]='\0';		
-	SZ_batchAddVar(s2, SZ_FLOAT, data, *errBoundMode, *absErrBound, *relBoundRatio, 0, *r4, *r3, *r2, *r1);
+	SZ_batchAddVar(s2, SZ_FLOAT, data, *errBoundMode, *absErrBound, *relBoundRatio, 0.1, 0, *r4, *r3, *r2, *r1);
 }
 void sz_batchaddvar_d5_float_(char* varName, int *len, float* data, int *errBoundMode, float *absErrBound, float *relBoundRatio, size_t *r1, size_t *r2, size_t *r3, size_t *r4, size_t *r5)
 {
@@ -447,7 +447,7 @@ void sz_batchaddvar_d5_float_(char* varName, int *len, float* data, int *errBoun
     for(i=0;i<*len;i++)
         s2[i]=varName[i];
     s2[*len]='\0';		
-	SZ_batchAddVar(s2, SZ_FLOAT, data, *errBoundMode, *absErrBound, *relBoundRatio, *r5, *r4, *r3, *r2, *r1);
+	SZ_batchAddVar(s2, SZ_FLOAT, data, *errBoundMode, *absErrBound, *relBoundRatio, 0.1, *r5, *r4, *r3, *r2, *r1);
 }
 void sz_batchaddvar_d1_double_(char* varName, int *len, double* data, int *errBoundMode, double *absErrBound, double *relBoundRatio, size_t *r1)
 {
@@ -456,7 +456,7 @@ void sz_batchaddvar_d1_double_(char* varName, int *len, double* data, int *errBo
     for(i=0;i<*len;i++)
         s2[i]=varName[i];
     s2[*len]='\0';		
-	SZ_batchAddVar(s2, SZ_DOUBLE, data, *errBoundMode, *absErrBound, *relBoundRatio, 0, 0, 0, 0, *r1);
+	SZ_batchAddVar(s2, SZ_DOUBLE, data, *errBoundMode, *absErrBound, *relBoundRatio, 0.1, 0, 0, 0, 0, *r1);
 }
 void sz_batchaddvar_d2_double_(char* varName, int *len, double* data, int *errBoundMode, double *absErrBound, double *relBoundRatio, size_t *r1, size_t *r2)
 {
@@ -465,7 +465,7 @@ void sz_batchaddvar_d2_double_(char* varName, int *len, double* data, int *errBo
     for(i=0;i<*len;i++)
         s2[i]=varName[i];
     s2[*len]='\0';		
-	SZ_batchAddVar(s2, SZ_DOUBLE, data, *errBoundMode, *absErrBound, *relBoundRatio, 0, 0, 0, *r2, *r1);
+	SZ_batchAddVar(s2, SZ_DOUBLE, data, *errBoundMode, *absErrBound, *relBoundRatio, 0.1, 0, 0, 0, *r2, *r1);
 }
 void sz_batchaddvar_d3_double_(char* varName, int *len, double* data, int *errBoundMode, double *absErrBound, double *relBoundRatio, size_t *r1, size_t *r2, size_t *r3)
 {
@@ -474,7 +474,7 @@ void sz_batchaddvar_d3_double_(char* varName, int *len, double* data, int *errBo
     for(i=0;i<*len;i++)
         s2[i]=varName[i];
     s2[*len]='\0';		
-	SZ_batchAddVar(s2, SZ_DOUBLE, data, *errBoundMode, *absErrBound, *relBoundRatio, 0, 0, *r3, *r2, *r1);
+	SZ_batchAddVar(s2, SZ_DOUBLE, data, *errBoundMode, *absErrBound, *relBoundRatio, 0.1, 0, 0, *r3, *r2, *r1);
 }
 void sz_batchaddvar_d4_double_(char* varName, int *len, double* data, int *errBoundMode, double *absErrBound, double *relBoundRatio, size_t *r1, size_t *r2, size_t *r3, size_t *r4)
 {
@@ -483,7 +483,7 @@ void sz_batchaddvar_d4_double_(char* varName, int *len, double* data, int *errBo
     for(i=0;i<*len;i++)
         s2[i]=varName[i];
     s2[*len]='\0';		
-	SZ_batchAddVar(s2, SZ_DOUBLE, data, *errBoundMode, *absErrBound, *relBoundRatio, 0, *r4, *r3, *r2, *r1);
+	SZ_batchAddVar(s2, SZ_DOUBLE, data, *errBoundMode, *absErrBound, *relBoundRatio, 0.1, 0, *r4, *r3, *r2, *r1);
 }
 void sz_batchaddvar_d5_double_(char* varName, int *len, double* data, int *errBoundMode, double *absErrBound, double *relBoundRatio, size_t *r1, size_t *r2, size_t *r3, size_t *r4, size_t *r5)
 {
@@ -492,7 +492,7 @@ void sz_batchaddvar_d5_double_(char* varName, int *len, double* data, int *errBo
     for(i=0;i<*len;i++)
         s2[i]=varName[i];
     s2[*len]='\0';		
-	SZ_batchAddVar(s2, SZ_DOUBLE, data, *errBoundMode, *absErrBound, *relBoundRatio, *r5, *r4, *r3, *r2, *r1);
+	SZ_batchAddVar(s2, SZ_DOUBLE, data, *errBoundMode, *absErrBound, *relBoundRatio, 0.1, *r5, *r4, *r3, *r2, *r1);
 }
 void sz_batchdelvar_c_(char* varName, int *len, int *errState)
 {
@@ -503,15 +503,18 @@ void sz_batchdelvar_c_(char* varName, int *len, int *errState)
     s2[*len]='\0';
 	*errState = SZ_batchDelVar(s2);
 }
+
+/* @deprecated No-op stub: performs no compression and reports a zero-length result. */
 void sz_batch_compress_c_(unsigned char* bytes, size_t *outSize)
 {
-	unsigned char* tmp_bytes = SZ_batch_compress(outSize);
-	memcpy(bytes, tmp_bytes, *outSize);
-	free(tmp_bytes);
+	/* The former SZ_batch_compress() call is disabled, so nothing is written to bytes. */
+	(void)bytes;
+	*outSize = 0; /* give callers a defined length instead of leaving the out-parameter unset */
 }
+/* @deprecated No-op stub: the SZ_batch_decompress() call is disabled, so bytes, byteLength and ierr are all left untouched. */
 void sz_batch_decompress_c_(unsigned char* bytes, size_t *byteLength, int *ierr)
 {
-	SZ_batch_decompress(bytes, *byteLength, ierr);
+	//SZ_batch_decompress(bytes, *byteLength, ierr);
 }
 
 void sz_getvardim_c_(char* varName, int *len, int *dim, size_t *r1, size_t *r2, size_t *r3, size_t *r4, size_t *r5)