diff --git a/GNUmakefile b/GNUmakefile
index 7ca615ae58f610c8680811aef7e08de595ab51ea..883bda375e11ba004ad9057c58f4a6346042734e 100644
--- a/GNUmakefile
+++ b/GNUmakefile
@@ -63,10 +63,12 @@ $(FEDIR)/%.o: thirdparty/blosc/%.c | $(FEDIR)
 $(FEDIR)/%.o: %.cxx | $(FEDIR)
 	$(CXX) $(FE_CFLAGS) $(FE_CPPFLAGS) -c -o $@ $<
 
-$(FEDIR)/GenericIOPrint: $(FEDIR)/GenericIOPrint.o $(FEDIR)/GenericIO.o  $(FEDIR)/blosc.o $(FEDIR)/blosclz.o $(FEDIR)/shuffle.o
+FE_BLOSC_O := $(FEDIR)/blosc.o $(FEDIR)/blosclz.o $(FEDIR)/shuffle.o $(FEDIR)/bitshuffle-generic.o $(FEDIR)/shuffle-generic.o
+
+$(FEDIR)/GenericIOPrint: $(FEDIR)/GenericIOPrint.o $(FEDIR)/GenericIO.o $(FE_BLOSC_O)
 	$(CXX) $(FE_CFLAGS) -o $@ $^ 
 
-$(FEDIR)/GenericIOVerify: $(FEDIR)/GenericIOVerify.o $(FEDIR)/GenericIO.o $(FEDIR)/blosc.o $(FEDIR)/blosclz.o $(FEDIR)/shuffle.o
+$(FEDIR)/GenericIOVerify: $(FEDIR)/GenericIOVerify.o $(FEDIR)/GenericIO.o $(FE_BLOSC_O)
 	$(CXX) $(FE_CFLAGS) -o $@ $^ 
 
 FE_UNAME := $(shell uname -s)
@@ -98,16 +100,18 @@ $(MPIDIR)/%.o: thirdparty/blosc/%.c | $(MPIDIR)
 $(MPIDIR)/%.o: %.cxx | $(MPIDIR)
 	$(MPICXX) $(MPI_CFLAGS) $(MPI_CPPFLAGS) -c -o $@ $<
 
-$(MPIDIR)/GenericIOPrint: $(MPIDIR)/GenericIOPrint.o $(MPIDIR)/GenericIO.o $(MPIDIR)/blosc.o $(MPIDIR)/blosclz.o $(MPIDIR)/shuffle.o
+MPI_BLOSC_O := $(MPIDIR)/blosc.o $(MPIDIR)/blosclz.o $(MPIDIR)/shuffle.o $(MPIDIR)/bitshuffle-generic.o $(MPIDIR)/shuffle-generic.o
+
+$(MPIDIR)/GenericIOPrint: $(MPIDIR)/GenericIOPrint.o $(MPIDIR)/GenericIO.o $(MPI_BLOSC_O)
 	$(MPICXX) $(MPI_CFLAGS) -o $@ $^ 
 
-$(MPIDIR)/GenericIOVerify: $(MPIDIR)/GenericIOVerify.o $(MPIDIR)/GenericIO.o $(MPIDIR)/blosc.o $(MPIDIR)/blosclz.o $(MPIDIR)/shuffle.o
+$(MPIDIR)/GenericIOVerify: $(MPIDIR)/GenericIOVerify.o $(MPIDIR)/GenericIO.o $(MPI_BLOSC_O)
 	$(MPICXX) $(MPI_CFLAGS) -o $@ $^ 
 
-$(MPIDIR)/GenericIOBenchmarkRead: $(MPIDIR)/GenericIOBenchmarkRead.o $(MPIDIR)/GenericIO.o $(MPIDIR)/blosc.o $(MPIDIR)/blosclz.o $(MPIDIR)/shuffle.o
+$(MPIDIR)/GenericIOBenchmarkRead: $(MPIDIR)/GenericIOBenchmarkRead.o $(MPIDIR)/GenericIO.o $(MPI_BLOSC_O)
 	$(MPICXX) $(MPI_CFLAGS) -o $@ $^ 
 
-$(MPIDIR)/GenericIOBenchmarkWrite: $(MPIDIR)/GenericIOBenchmarkWrite.o $(MPIDIR)/GenericIO.o $(MPIDIR)/blosc.o $(MPIDIR)/blosclz.o $(MPIDIR)/shuffle.o
+$(MPIDIR)/GenericIOBenchmarkWrite: $(MPIDIR)/GenericIOBenchmarkWrite.o $(MPIDIR)/GenericIO.o $(MPI_BLOSC_O)
 	$(MPICXX) $(MPI_CFLAGS) -o $@ $^ 
 
 frontend-progs: $(FEDIR)/GenericIOPrint $(FEDIR)/GenericIOVerify
diff --git a/thirdparty/blosc/ANNOUNCE.rst b/thirdparty/blosc/ANNOUNCE.rst
index 4e1aa588acb57ea89a0822ca753dc383f73c4dd9..1d8be28bcb0ce75072e7bc143d7e7aec6cffaf4a 100644
--- a/thirdparty/blosc/ANNOUNCE.rst
+++ b/thirdparty/blosc/ANNOUNCE.rst
@@ -1,44 +1,36 @@
 ===============================================================
- Announcing Blosc 1.2.3
- A blocking, shuffling and lossless compression library
+ Announcing c-blosc 1.10.0
+ A blocking, shuffling and lossless compression library for C
 ===============================================================
 
 What is new?
 ============
 
-New `blosc_init()` and `blosc_destroy()` functions have been added so
-that the global lock can be initialized safely. These new functions
-will also allow for other kind of initializations/destructions in the
-future.
+This release introduces support for the new Zstd codec. Zstd is meant to
+achieve larger compression ratios than Zlib, but with higher speeds. We
+are talking about a well-balanced codec that should see a lot of use
+among Blosc users. There is a blog post about what you can expect from it at:
 
-Existing applications using Blosc do not need to start using the new
-functions right away, as long as they calling `blosc_set_nthreads()`
-previous to anything else.  However, using them is highly recommended.
-
-Thanks to Oscar Villellas for the init/destroy suggestion, it is a
-nice idea indeed!
+http://blosc.org/blog/zstd-has-just-landed-in-blosc.html
 
 For more info, please see the release notes in:
 
-https://github.com/FrancescAlted/blosc/wiki/Release-notes
+https://github.com/Blosc/c-blosc/blob/master/RELEASE_NOTES.rst
+
 
 What is it?
 ===========
 
-Blosc (http://www.blosc.org) is a high performance compressor
+Blosc (http://www.blosc.org) is a high performance meta-compressor
 optimized for binary data.  It has been designed to transmit data to
 the processor cache faster than the traditional, non-compressed,
 direct memory fetch approach via a memcpy() OS call.
 
-Blosc is the first compressor (that I'm aware of) that is meant not
-only to reduce the size of large datasets on-disk or in-memory, but
-also to accelerate object manipulations that are memory-bound.
+Blosc has internal support for different compressors like its internal
+BloscLZ, but also LZ4, LZ4HC, Snappy and Zlib.  This way these can
+automatically leverage the multithreading and pre-filtering
+(shuffling) capabilities that come with Blosc.
 
-There is also a handy command line for Blosc called Bloscpack
-(https://github.com/esc/bloscpack) that allows you to compress large
-binary datafiles on-disk.  Although the format for Bloscpack has not
-stabilized yet, it allows you to effectively use Blosc from you
-favorite shell.
 
 Download sources
 ================
@@ -49,11 +41,12 @@ http://www.blosc.org/
 
 and proceed from there.  The github repository is over here:
 
-https://github.com/FrancescAlted/blosc
+https://github.com/Blosc
 
 Blosc is distributed using the MIT license, see LICENSES/BLOSC.txt for
 details.
 
+
 Mailing list
 ============
 
@@ -64,10 +57,3 @@ http://groups.google.es/group/blosc
 
 
 Enjoy Data!
-
-
-.. Local Variables:
-.. mode: rst
-.. coding: utf-8
-.. fill-column: 70
-.. End:
diff --git a/thirdparty/blosc/LICENSES/BITSHUFFLE.txt b/thirdparty/blosc/LICENSES/BITSHUFFLE.txt
new file mode 100644
index 0000000000000000000000000000000000000000..1365ed69b063ad4f0f2e5977b90219c6c7a74836
--- /dev/null
+++ b/thirdparty/blosc/LICENSES/BITSHUFFLE.txt
@@ -0,0 +1,21 @@
+Bitshuffle - Filter for improving compression of typed binary data.
+
+Copyright (c) 2014 Kiyoshi Masui (kiyo@physics.ubc.ca)
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
diff --git a/thirdparty/blosc/LICENSES/BLOSC.txt b/thirdparty/blosc/LICENSES/BLOSC.txt
index 5b0feb7c9593cc65be5c6aa763b16b58257de5f8..6956017a8cd90e17a2a78f5e8ed0fd5eebab323b 100644
--- a/thirdparty/blosc/LICENSES/BLOSC.txt
+++ b/thirdparty/blosc/LICENSES/BLOSC.txt
@@ -1,7 +1,6 @@
 Blosc - A blocking, shuffling and lossless compression library
 
-Copyright (C) 2009-2012 Francesc Alted <faltet@gmail.com>
-Copyright (C) 2013      Francesc Alted <faltet@gmail.com>
+Copyright (C) 2009-2016 Francesc Alted <francesc@blosc.org>
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -20,4 +19,3 @@ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
-
diff --git a/thirdparty/blosc/LICENSES/LZ4.txt b/thirdparty/blosc/LICENSES/LZ4.txt
new file mode 100644
index 0000000000000000000000000000000000000000..2383e10348dc1407703ec56a66a20e1053f60125
--- /dev/null
+++ b/thirdparty/blosc/LICENSES/LZ4.txt
@@ -0,0 +1,32 @@
+LZ4 - Fast LZ compression algorithm
+
+Copyright (C) 2011-2014, Yann Collet.
+BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+    * Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above
+copyright notice, this list of conditions and the following disclaimer
+in the documentation and/or other materials provided with the
+distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+You can contact the author at :
+- LZ4 homepage : http://fastcompression.blogspot.com/p/lz4.html
+- LZ4 source repository : http://code.google.com/p/lz4/
+
diff --git a/thirdparty/blosc/LICENSES/SNAPPY.txt b/thirdparty/blosc/LICENSES/SNAPPY.txt
new file mode 100644
index 0000000000000000000000000000000000000000..8d6bd9fed4e437af46e342ff161cb2bc11fa0c61
--- /dev/null
+++ b/thirdparty/blosc/LICENSES/SNAPPY.txt
@@ -0,0 +1,28 @@
+Copyright 2011, Google Inc.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+    * Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above
+copyright notice, this list of conditions and the following disclaimer
+in the documentation and/or other materials provided with the
+distribution.
+    * Neither the name of Google Inc. nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/thirdparty/blosc/LICENSES/STDINT.txt b/thirdparty/blosc/LICENSES/STDINT.txt
index 7e9941ad30e9cb50e8f09578e0a782fa8f180afa..c28001d13f63fde80144eb409e109360ae79ea66 100644
--- a/thirdparty/blosc/LICENSES/STDINT.txt
+++ b/thirdparty/blosc/LICENSES/STDINT.txt
@@ -1,4 +1,7 @@
-Copyright (c) 2006-2008 Alexander Chemeris
+ISO C9x  compliant stdint.h for Microsoft Visual Studio
+Based on ISO/IEC 9899:TC2 Committee draft (May 6, 2005) WG14/N1124
+
+ Copyright (c) 2006-2013 Alexander Chemeris
 
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are met:
@@ -10,8 +13,9 @@ modification, are permitted provided that the following conditions are met:
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
 
-  3. The name of the author may be used to endorse or promote products
-     derived from this software without specific prior written permission.
+  3. Neither the name of the product nor the names of its contributors may
+     be used to endorse or promote products derived from this software
+     without specific prior written permission.
 
 THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED
 WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
@@ -22,4 +26,4 @@ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 
 WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
 OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
-ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
\ No newline at end of file
+ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/thirdparty/blosc/LICENSES/ZLIB.txt b/thirdparty/blosc/LICENSES/ZLIB.txt
new file mode 100644
index 0000000000000000000000000000000000000000..5d74f5ce5d7e29ea64fc6f44db3158d97cad4a34
--- /dev/null
+++ b/thirdparty/blosc/LICENSES/ZLIB.txt
@@ -0,0 +1,22 @@
+Copyright notice:
+
+ (C) 1995-2013 Jean-loup Gailly and Mark Adler
+
+  This software is provided 'as-is', without any express or implied
+  warranty.  In no event will the authors be held liable for any damages
+  arising from the use of this software.
+
+  Permission is granted to anyone to use this software for any purpose,
+  including commercial applications, and to alter it and redistribute it
+  freely, subject to the following restrictions:
+
+  1. The origin of this software must not be misrepresented; you must not
+     claim that you wrote the original software. If you use this software
+     in a product, an acknowledgment in the product documentation would be
+     appreciated but is not required.
+  2. Altered source versions must be plainly marked as such, and must not be
+     misrepresented as being the original software.
+  3. This notice may not be removed or altered from any source distribution.
+
+  Jean-loup Gailly        Mark Adler
+  jloup@gzip.org          madler@alumni.caltech.edu
diff --git a/thirdparty/blosc/README.rst b/thirdparty/blosc/README.rst
index 4a625e38093c44caf7b84e7af4c40981a0341311..629d185c5c8cbdbd503ea26ed93b7710cceabc78 100644
--- a/thirdparty/blosc/README.rst
+++ b/thirdparty/blosc/README.rst
@@ -3,8 +3,22 @@
 ===============================================================
 
 :Author: Francesc Alted
-:Contact: faltet@gmail.com
+:Contact: francesc@blosc.org
 :URL: http://www.blosc.org
+:Gitter: |gitter|
+:Travis CI: |travis|
+:Appveyor: |appveyor|
+
+.. |gitter| image:: https://badges.gitter.im/Blosc/c-blosc.svg
+        :alt: Join the chat at https://gitter.im/Blosc/c-blosc
+        :target: https://gitter.im/Blosc/c-blosc?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge
+
+.. |travis| image:: https://travis-ci.org/Blosc/c-blosc.svg?branch=master
+        :target: https://travis-ci.org/Blosc/c-blosc
+
+.. |appveyor| image:: https://ci.appveyor.com/api/projects/status/3mlyjc1ak0lbkmte?svg=true
+        :target: https://ci.appveyor.com/project/FrancescAlted/c-blosc/branch/master
+
 
 What is it?
 ===========
@@ -17,41 +31,57 @@ that is meant not only to reduce the size of large datasets on-disk or
 in-memory, but also to accelerate memory-bound computations.
 
 It uses the blocking technique (as described in [2]_) to reduce
-activity on the memory bus as much as possible.  In short, this
+activity on the memory bus as much as possible. In short, this
 technique works by dividing datasets in blocks that are small enough
 to fit in caches of modern processors and perform compression /
 decompression there.  It also leverages, if available, SIMD
-instructions (SSE2) and multi-threading capabilities of CPUs, in order
-to accelerate the compression / decompression process to a maximum.
-
-You can see some recent benchmarks about Blosc performance in [3]_
+instructions (SSE2, AVX2) and multi-threading capabilities of CPUs, in
+order to accelerate the compression / decompression process to a
+maximum.
+
+Blosc is actually a metacompressor, meaning that it can use a range
+of compression libraries for performing the actual
+compression/decompression. Right now, it comes with integrated support
+for BloscLZ (the original one), LZ4, LZ4HC, Snappy, Zlib and Zstd. Blosc
+comes with full sources for all compressors, so in case it does not find
+the libraries installed in your system, it will compile from the
+included sources and they will be integrated into the Blosc library
+anyway. That means that you can trust in having all supported
+compressors integrated in Blosc in all supported platforms.
+
+You can see some benchmarks about Blosc performance in [3]_
 
 Blosc is distributed using the MIT license, see LICENSES/BLOSC.txt for
 details.
 
 .. [1] http://www.blosc.org
 .. [2] http://blosc.org/docs/StarvingCPUs-CISE-2010.pdf
-.. [3] http://blosc.org/trac/wiki/SyntheticBenchmarks
+.. [3] http://blosc.org/synthetic-benchmarks.html
 
 Meta-compression and other advantages over existing compressors
 ===============================================================
 
-Blosc is not like other compressors: it should rather be called a
+C-Blosc is not like other compressors: it should rather be called a
 meta-compressor.  This is so because it can use different compressors
-and pre-conditioners (programs that generally improve compression
-ratio).  At any rate, it can also be called a compressor because it
-happens that it already integrates one compressor and one
-pre-conditioner, so it can actually work like so.
-
-Currently it uses BloscLZ, a compressor heavily based on FastLZ
-(http://fastlz.org/), and a highly optimized (it can use SSE2
-instructions, if available) Shuffle pre-conditioner. However,
-different compressors or pre-conditioners may be added in the future.
-
-Blosc is in charge of coordinating the compressor and pre-conditioners
-so that they can leverage the blocking technique (described above) as
-well as multi-threaded execution (if several cores are available)
-automatically. That makes that every compressor and pre-conditioner
+and filters (programs that generally improve compression ratio).  At
+any rate, it can also be called a compressor because it happens that
+it already comes with several compressors and filters, so it can
+actually work like so.
+
+Currently C-Blosc comes with support of BloscLZ, a compressor heavily
+based on FastLZ (http://fastlz.org/), LZ4 and LZ4HC
+(https://github.com/Cyan4973/lz4), Snappy
+(https://github.com/google/snappy) and Zlib (http://www.zlib.net/), as
+well as highly optimized (they can use SSE2 or AVX2 instructions, if
+available) shuffle and bitshuffle filters (for info on how and why
+shuffling works, see slide 17 of
+http://www.slideshare.net/PyData/blosc-py-data-2014).  However,
+different compressors or filters may be added in the future.
+
+C-Blosc is in charge of coordinating the different compressors and
+filters so that they can leverage the blocking technique (described
+above) as well as multi-threaded execution (if several cores are
+available) automatically. This means that every compressor and filter
 will work at very high speeds, even if it was not initially designed
 for doing blocking or multi-threading.
 
@@ -59,85 +89,104 @@ Other advantages of Blosc are:
 
 * Meant for binary data: can take advantage of the type size
   meta-information for improved compression ratio (using the
-  integrated shuffle pre-conditioner).
-
-* Small overhead on non-compressible data: only a maximum of 16
-  additional bytes over the source buffer length are needed to
-  compress *every* input.
+  integrated shuffle and bitshuffle filters).
 
-* Maximum destination length: contrarily to many other
-  compressors, both compression and decompression routines have
-  support for maximum size lengths for the destination buffer.
+* Small overhead on non-compressible data: only a maximum of (16 + 4 *
+  nthreads) additional bytes over the source buffer length are needed
+  to compress *any kind of input*.
 
-* Replacement for memcpy(): it supports a 0 compression level that
-  does not compress at all and only adds 16 bytes of overhead. In
-  this mode Blosc can copy memory usually faster than a plain
-  memcpy().
+* Maximum destination length: contrary to many other compressors,
+  both compression and decompression routines have support for maximum
+  size lengths for the destination buffer.
 
 When taken together, all these features set Blosc apart from other
 similar solutions.
 
-Compiling your application with Blosc
-=====================================
+Compiling your application with a minimalistic Blosc
+====================================================
 
-Blosc consists of the next files (in blosc/ directory)::
+The minimal Blosc consists of the following files (in `blosc/ directory
+<https://github.com/Blosc/c-blosc/tree/master/blosc>`_)::
 
-    blosc.h and blosc.c      -- the main routines
-    blosclz.h and blosclz.c  -- the actual compressor
-    shuffle.h and shuffle.c  -- the shuffle code
+    blosc.h and blosc.c        -- the main routines
+    shuffle*.h and shuffle*.c  -- the shuffle code
+    blosclz.h and blosclz.c    -- the blosclz compressor
 
 Just add these files to your project in order to use Blosc.  For
-information on compression and decompression routines, see blosc.h.
+information on compression and decompression routines, see `blosc.h
+<https://github.com/Blosc/c-blosc/blob/master/blosc/blosc.h>`_.
 
-To compile using GCC (4.4 or higher recommended) on Unix:
+To compile using GCC (4.9 or higher recommended) on Unix:
 
 .. code-block:: console
 
-   $ gcc -O3 -msse2 -o myprog myprog.c blosc/*.c -lpthread
+   $ gcc -O3 -mavx2 -o myprog myprog.c blosc/*.c -Iblosc -lpthread
 
 Using Windows and MINGW:
 
 .. code-block:: console
 
-   $ gcc -O3 -msse2 -o myprog myprog.c blosc\*.c
+   $ gcc -O3 -mavx2 -o myprog myprog.c -Iblosc blosc\*.c
 
-Using Windows and MSVC (2008 or higher recommended):
+Using Windows and MSVC (2013 or higher recommended):
 
 .. code-block:: console
 
-  $ cl /Ox /Femyprog.exe myprog.c blosc\*.c
+  $ cl /Ox /Femyprog.exe /Iblosc myprog.c blosc\*.c
 
-A simple usage example is the benchmark in the bench/bench.c file.
-Also, another example for using Blosc as a generic HDF5 filter is in
-the hdf5/ directory.
+In the `examples/ directory
+<https://github.com/Blosc/c-blosc/tree/master/examples>`_ you can find
+more hints on how to link your app with Blosc.
 
-I have not tried to compile this with compilers other than GCC, MINGW,
-Intel ICC or MSVC yet. Please report your experiences with your own
-platforms.
+I have not tried to compile this with compilers other than GCC, clang,
+MINGW, Intel ICC or MSVC yet. Please report your experiences with your
+own platforms.
+
+Adding support for other compressors with a minimalistic Blosc
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-Testing Blosc
-=============
+The official cmake files (see below) for Blosc try hard to include
+support for LZ4, LZ4HC, Snappy, Zlib inside the Blosc library, so
+using them is just a matter of calling the appropriate
+`blosc_set_compressor() API call
+<https://github.com/Blosc/c-blosc/blob/master/blosc/blosc.h>`_.  See
+an `example here
+<https://github.com/Blosc/c-blosc/blob/master/examples/many_compressors.c>`_.
 
-Go to the test/ directory and issue:
+Having said this, it is also easy to use a minimalistic Blosc and just
+add the symbols HAVE_LZ4 (will include both LZ4 and LZ4HC),
+HAVE_SNAPPY and HAVE_ZLIB during compilation as well as the
+appropriate libraries. For example, for compiling with minimalistic
+Blosc but with added Zlib support do:
 
 .. code-block:: console
 
-  $ make test
+   $ gcc -O3 -msse2 -o myprog myprog.c blosc/*.c -Iblosc -lpthread -DHAVE_ZLIB -lz
 
-These tests are very basic, and only valid for platforms where GNU
-make/gcc tools are available.  If you really want to test Blosc the
-hard way, look at:
+In the `bench/ directory
+<https://github.com/Blosc/c-blosc/tree/master/bench>`_ there are a couple
+of Makefiles (one for UNIX and the other for MinGW) with more
+complete building examples, like switching between libraries or
+internal sources for the compressors.
 
-http://blosc.org/trac/wiki/SyntheticBenchmarks
+Supported platforms
+~~~~~~~~~~~~~~~~~~~
 
-where instructions on how to intensively test (and benchmark) Blosc
-are given.  If while running these tests you get some error, please
-report it back!
+Blosc is meant to support all platforms where a C89 compliant C
+compiler can be found.  The ones that are mostly tested are Intel
+(Linux, Mac OSX and Windows) and ARM (Linux), but exotic ones such as the
+IBM Blue Gene Q embedded "A2" processor are reported to work too.
 
 Compiling the Blosc library with CMake
 ======================================
 
-Blosc can also be built, tested and installed using CMake_.
+Blosc can also be built, tested and installed using CMake_. Although
+this procedure might seem a bit more involved than the one described
+above, it is the most general because it allows you to integrate
+compressors other than BloscLZ, either from libraries or from internal
+sources. Hence, serious library developers are encouraged to use this
+way.
+
 The following procedure describes the "out of source" build.
 
 Create the build directory and move into it:
@@ -147,46 +196,109 @@ Create the build directory and move into it:
   $ mkdir build
   $ cd build
 
-Configure Blosc in release mode (enable optimizations) specifying the
-installation directory:
+Now run CMake configuration and optionally specify the installation
+directory (e.g. '/usr' or '/usr/local'):
 
 .. code-block:: console
 
-  $ cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=INSTALL_DIR \
-      PATH_TO_BLOSC_SOURCE_DIR
+  $ cmake -DCMAKE_INSTALL_PREFIX=your_install_prefix_directory ..
 
-Please note that configuration can also be performed using UI tools
-provided by CMake_ (ccmake or cmake-gui):
+CMake allows you to configure Blosc in many different ways, like preferring
+internal or external sources for compressors or enabling/disabling
+them.  Please note that configuration can also be performed using UI
+tools provided by CMake_ (ccmake or cmake-gui):
 
 .. code-block:: console
 
-  $ cmake-gui PATH_TO_BLOSC_SOURCE_DIR
+  $ ccmake ..      # run a curses-based interface
+  $ cmake-gui ..   # run a graphical interface
 
 Build, test and install Blosc:
 
 .. code-block:: console
 
-  $ make
-  $ make test
-  $ make install 
+  $ cmake --build .
+  $ ctest
+  $ cmake --build . --target install
 
 The static and dynamic version of the Blosc library, together with
-header files, will be installed into the specified INSTALL_DIR.
+header files, will be installed into the specified
+CMAKE_INSTALL_PREFIX.
 
 .. _CMake: http://www.cmake.org
 
+Once you have compiled your Blosc library, you can easily link your
+apps with it as shown in the `examples/ directory
+<https://github.com/Blosc/c-blosc/blob/master/examples>`_.
+
+Adding support for other compressors (LZ4, LZ4HC, Snappy, Zlib) with CMake
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The CMake files in Blosc are configured to automatically detect other
+compressors like LZ4, LZ4HC, Snappy or Zlib by default.  So as long as
+the libraries and the header files for these libraries are accessible,
+these will be used by default.  See an `example here
+<https://github.com/Blosc/c-blosc/blob/master/examples/many_compressors.c>`_.
+
+*Note on Zlib*: the library should be easily found on UNIX systems,
+although on Windows, you can help CMake to find it by setting the
+environment variable 'ZLIB_ROOT' to where zlib 'include' and 'lib'
+directories are. Also, make sure that the Zlib DLL is in your
+'\Windows' directory.
+
+However, the full sources for LZ4, LZ4HC, Snappy and Zlib have been
+included in Blosc too. So, in general, you should not worry about not
+having (or CMake not finding) the libraries in your system because in
+this case, their sources will be automatically compiled for you. That
+effectively means that you can be confident in having complete
+support for all the supported compression libraries in all supported
+platforms.
+
+If you want to force Blosc to use external libraries instead of
+the included compression sources:
+
+.. code-block:: console
+
+  $ cmake -DPREFER_EXTERNAL_LZ4=ON ..
+
+You can also disable support for some compression libraries:
+
+.. code-block:: console
+
+  $ cmake -DDEACTIVATE_SNAPPY=ON ..
+
+Mac OSX troubleshooting
+~~~~~~~~~~~~~~~~~~~~~~~
+
+If you run into compilation troubles when using Mac OSX, please make
+sure that you have installed the command line developer tools.  You
+can always install them with:
+
+.. code-block:: console
+
+  $ xcode-select --install
+
 Wrapper for Python
 ==================
 
 Blosc has an official wrapper for Python.  See:
 
-https://github.com/FrancescAlted/python-blosc
+https://github.com/Blosc/python-blosc
+
+Command line interface and serialization format for Blosc
+=========================================================
+
+Blosc can be used from command line by using Bloscpack.  See:
+
+https://github.com/Blosc/bloscpack
 
 Filter for HDF5
 ===============
 
-For those that want to use Blosc as a filter in the HDF5 library,
-there is a sample implementation in the hdf5/ directory.
+For those who want to use Blosc as a filter in the HDF5 library,
+there is a sample implementation in the blosc/hdf5 project in:
+
+https://github.com/Blosc/hdf5
 
 Mailing list
 ============
@@ -199,21 +311,7 @@ http://groups.google.es/group/blosc
 Acknowledgments
 ===============
 
-I'd like to thank the PyTables community that have collaborated in the
-exhaustive testing of Blosc.  With an aggregate amount of more than 300 TB of
-different datasets compressed *and* decompressed successfully, I can say that
-Blosc is pretty safe now and ready for production purposes.
-
-Other important contributions:
-
-* Thibault North contributed a way to call Blosc from different threads in a
-  safe way.
-
-* The cmake support was a contribution of Thibault North, Antonio Valentino
-  and Mark Wiebe.
-
-* Valentin Haenel did a terrific work fixing typos and improving docs and the
-  plotting script.
+See THANKS.rst.
 
 
 ----
diff --git a/thirdparty/blosc/README_HEADER.rst b/thirdparty/blosc/README_HEADER.rst
index ec50c887f8a6f61d009baa9638cdd5366790eb97..81b5f83cd9791311e7a7365927ead18704145810 100644
--- a/thirdparty/blosc/README_HEADER.rst
+++ b/thirdparty/blosc/README_HEADER.rst
@@ -20,14 +20,40 @@ All entries are little endian.
 :version:
     (``uint8``) Blosc format version.
 :versionlz:
-    (``uint8``) Blosclz format  version (internal Lempel-Ziv algorithm).
-:flags:
-    (``bitfield``) The flags of the buffer.
+    (``uint8``) Version of the internal compressor used.
+:flags and compressor enumeration:
+    (``bitfield``) The flags of the buffer.
 
     :bit 0 (``0x01``):
-        Whether the shuffle filter has been applied or not.
+        Whether the byte-shuffle filter has been applied or not.
     :bit 1 (``0x02``):
         Whether the internal buffer is a pure memcpy or not.
+    :bit 2 (``0x04``):
+        Whether the bit-shuffle filter has been applied or not.
+    :bit 3 (``0x08``):
+        Reserved
+    :bit 4 (``0x10``):
+        Reserved
+    :bit 5 (``0x20``):
+        Part of the enumeration for compressors.
+    :bit 6 (``0x40``):
+        Part of the enumeration for compressors.
+    :bit 7 (``0x80``):
+        Part of the enumeration for compressors.
+
+    The last three bits form an enumeration that allows the use of
+    alternative compressors.
+
+    :``0``:
+        ``blosclz``
+    :``1``:
+        ``lz4`` or ``lz4hc``
+    :``2``:
+        ``snappy``
+    :``3``:
+        ``zlib``
+    :``4``:
+        ``zstd``
 
 :typesize:
     (``uint8``) Number of bytes for the atomic type.
@@ -37,4 +63,3 @@ All entries are little endian.
     (``uint32``) Size of internal blocks.
 :ctbytes:
     (``uint32``) Compressed size of the buffer.
-
diff --git a/thirdparty/blosc/RELEASE_NOTES.rst b/thirdparty/blosc/RELEASE_NOTES.rst
index aa39ef1fb526fc937af9f421e03dc40d6957255d..00c8ea0a0043f803037ea4ef134dda3731547807 100644
--- a/thirdparty/blosc/RELEASE_NOTES.rst
+++ b/thirdparty/blosc/RELEASE_NOTES.rst
@@ -1,12 +1,479 @@
-===============================
- Release notes for Blosc 1.2.3
-===============================
+===========================
+ Release notes for C-Blosc
+===========================
 
 :Author: Francesc Alted
-:Contact: faltet@gmail.com
+:Contact: francesc@blosc.org
 :URL: http://www.blosc.org
 
 
+Changes from 1.10.0 to 1.10.1
+=============================
+
+ #XXX version-specific blurb XXX#
+
+
+Changes from 1.9.3 to 1.10.0
+============================
+
+- Initial support for Zstandard (0.7.4). Zstandard (or Zstd for short) is a new
+  compression library that allows better compression than Zlib, but that works
+  typically faster (and sometimes much faster), making it a good match for
+  Blosc.
+
+  Although the Zstd format is considered stable
+  (http://fastcompression.blogspot.com.es/2016_07_03_archive.html), its API is
+  maturing very fast, and despite passing the extreme test suite for C-Blosc,
+  this codec should be considered in beta for C-Blosc usage purposes. Please
+  test it and report back any possible issues you may get.
+
+
+Changes from 1.9.2 to 1.9.3
+===========================
+
+- Reverted a mistake introduced in 1.7.1.  At that time, bit-shuffling
+  was enabled for typesize == 1 (i.e. strings), but the change also
+  included byte-shuffling accidentally.  This only affected performance,
+  but in a quite bad way (a copy was needed).  This has been fixed and
+  byte-shuffling is not active when typesize == 1 anymore.
+
+
+Changes from 1.9.1 to 1.9.2
+===========================
+
+- Check whether Blosc is actually initialized before blosc_init(),
+  blosc_destroy() and blosc_free_resources().  This makes the library
+  more resistant to different initialization cycles
+  (e.g. https://github.com/stevengj/Blosc.jl/issues/19).
+
+
+Changes from 1.9.0 to 1.9.1
+===========================
+
+- The internal copies when clevel=0 are made now via memcpy().  At the
+  beginning of C-Blosc development, benchmarks were saying that the
+  internal, multi-threaded copies inside C-Blosc were faster than
+  memcpy(), but 6 years later, memcpy() made great strides in terms
+  of efficiency.  With this, you should expect a slight speed
+  advantage (10% ~ 20%) when C-Blosc is used as a replacement of
+  memcpy() (which should not be the most common scenario out there).
+
+- Added a new DEACTIVATE_AVX2 cmake option to explicitly disable AVX2
+  at build-time.  Thanks to James Bird.
+
+- The ``make -jN`` for parallel compilation should work now.  Thanks
+  to James Bird.
+
+
+Changes from 1.8.1 to 1.9.0
+===========================
+
+* New blosc_get_nthreads() function to get the number of threads that
+  will be used internally during compression/decompression (set by
+  already existing blosc_set_nthreads()).
+
+* New blosc_get_compressor() function to get the compressor that will
+  be used internally during compression (set by already existing
+  blosc_set_compressor()).
+
+* New blosc_get_blocksize() function to get the internal blocksize to
+  be used during compression (set by already existing
+  blosc_set_blocksize()).
+
+* Now, when the BLOSC_NOLOCK environment variable is set (to any
+  value), the calls to blosc_compress() and blosc_decompress() will
+  call blosc_compress_ctx() and blosc_decompress_ctx() under the hood
+  so as to avoid the internal locks.  See blosc.h for details.  This
+  allows multi-threaded apps calling the non _ctx() functions to avoid
+  the internal locks in C-Blosc.  For the not multi-threaded app
+  though, it is in general slower to call the _ctx() functions so the
+  use of BLOSC_NOLOCK is discouraged.
+
+* In the same vein, from now on, when the BLOSC_NTHREADS environment
+  variable is set to an integer, every call to blosc_compress() and
+  blosc_decompress() will call blosc_set_nthreads(BLOSC_NTHREADS)
+  before the actual compression/decompression process.  See blosc.h
+  for details.
+
+* Finally, if BLOSC_CLEVEL, BLOSC_SHUFFLE, BLOSC_TYPESIZE and/or
+  BLOSC_COMPRESSOR variables are set in the environment, these will be
+  also honored before calling blosc_compress().
+
+* Calling blosc_init() before any other Blosc call, although
+  recommended, is not necessary anymore.  The idea is that you can use
+  just the basic blosc_compress() and blosc_decompress() and control
+  other parameters (nthreads, compressor, blocksize) by using
+  environment variables (see above).
+
+
+Changes from 1.8.0 to 1.8.1
+===========================
+
+* Disable the use of __builtin_cpu_supports() for GCC 5.3.1
+  compatibility.  Details in:
+  https://lists.fedoraproject.org/archives/list/devel@lists.fedoraproject.org/thread/ZM2L65WIZEEQHHLFERZYD5FAG7QY2OGB/
+
+
+Changes from 1.7.1 to 1.8.0
+===========================
+
+* The code is (again) compatible with VS2008 and VS2010.  This is
+  important for compatibility with Python 2.6/2.7/3.3/3.4.
+
+* Introduced a new global lock during blosc_decompress() operation.
+  As the blosc_compress() was already guarded by a global lock, this
+  means that the compression/decompression is again thread safe.
+  However, when using C-Blosc from multi-threaded environments, it is
+  important to keep using the *_ctx() functions for performance
+  reasons.  NOTE: _ctx() functions will be replaced by more powerful
+  ones in C-Blosc 2.0.
+
+
+Changes from 1.7.0 to 1.7.1
+===========================
+
+* Fixed a bug preventing bitshuffle from working correctly on getitem().
+  Now, everything with bitshuffle seems to work correctly.
+
+* Fixed the thread initialization for blosc_decompress_ctx().  Issue
+  #158.  Thanks to Chris Webers.
+
+* Fixed a bug in the blocksize computation introduced in 1.7.0.  This
+  could have been creating segfaults.
+
+* Allow bitshuffle to run on 1-byte typesizes.
+
+* New parametrization of the blocksize to be independent of the
+  typesize.  This allows a smoother speed throughout all typesizes.
+
+* lz4 and lz4hc codecs upgraded to 1.7.2 (from 1.7.0).
+
+* Calling set_nthreads() without actually changing the number of
+  threads no longer tears down and sets up the internal pool again.
+  PR #153.  Thanks to Santi Villalba.
+
+
+Changes from 1.6.1 to 1.7.0
+===========================
+
+* Added a new 'bitshuffle' filter so that the shuffle takes place at a
+  bit level and not just at a byte one, which is what the previous
+  'shuffle' filter does.
+
+  For activating this new bit-level filter you only have to pass the
+  symbol BLOSC_BITSHUFFLE to `blosc_compress()`.  For the previous
+  byte-level one, pass BLOSC_SHUFFLE.  For disabling the shuffle, pass
+  BLOSC_NOSHUFFLE.
+
+  This is a port of the existing filter in
+  https://github.com/kiyo-masui/bitshuffle.  Thanks to Kiyo Masui for
+  changing the license and allowing its inclusion here.
+
+* New acceleration mode for LZ4 and BloscLZ codecs that enters in
+  operation with complevel < 9.  This allows for an important boost in
+  speed with minimal compression ratio loss.  Francesc Alted.
+
+* LZ4 codec updated to 1.7.0 (r130).
+
+* PREFER_EXTERNAL_COMPLIBS cmake option has been removed and replaced
+  by the more fine grained PREFER_EXTERNAL_LZ4, PREFER_EXTERNAL_SNAPPY
+  and PREFER_EXTERNAL_ZLIB.  In order to allow the use of the new API
+  introduced in LZ4 1.7.0, PREFER_EXTERNAL_LZ4 has been set to OFF by
+  default, whereas PREFER_EXTERNAL_SNAPPY and PREFER_EXTERNAL_ZLIB
+  continues to be ON.
+
+* Implemented SSE2 shuffle support for buffers containing a number of
+  elements which is not a multiple of (typesize * vectorsize).  Jack
+  Pappas.
+
+* Added SSE2 shuffle/unshuffle routines for types larger than 16
+  bytes.  Jack Pappas.
+
+* 'test_basic' suite has been split in components for a much better
+  granularity on what's a possibly failing test.  Also, lots of new
+  tests have been added.  Jack Pappas.
+
+* Fixed compilation on non-Intel archs (tested on ARM).  Zbyszek
+  Szmek.
+
+* Modified cmake files in order to inform that AVX2 on Visual Studio
+  is supported only in 2013 update 2 and higher.
+
+* Added a replacement for stdbool.h for Visual Studio < 2013.
+
+* blosclz codec adds Win64/Intel as a platform supporting unaligned
+  addressing.  That leads to a speed-up of 2.2x in decompression.
+
+* New blosc_get_version_string() function for retrieving the version
+  of the c-blosc library.  Useful when linking with dynamic libraries
+  and one wants to know its version.
+
+* New example (win-dynamic-linking.c) that shows how to link a Blosc
+  DLL dynamically in run-time (Windows only).
+
+* The `context.threads_started` is initialized now when decompressing.
+  This could cause crashes in case you decompressed before compressing
+  (e.g. directly deserializing blosc buffers).  @atchouprakov.
+
+* The HDF5 filter has been removed from c-blosc and moved into its own
+  repo at: https://github.com/Blosc/hdf5
+
+* The MS Visual Studio 2008 has been tested with c-blosc for ensuring
+  compatibility with extensions for Python 2.6 and up.
+
+
+Changes from 1.6.0 to 1.6.1
+===========================
+
+* Support for *runtime* detection of AVX2 and SSE2 SIMD instructions.
+  These changes make it possible to compile one single binary that
+  runs on a system that supports SSE2 or AVX2 (or neither), so the
+  redistribution problem is fixed (see #101).  Thanks to Julian Taylor
+  and Jack Pappas.
+
+* Added support for MinGW and TDM-GCC compilers for Windows.  Thanks
+  to yasushima-gd.
+
+* Fixed a bug in blosclz that could potentially overwrite an area
+  beyond the output buffer.  See #113.
+
+* New computation for blocksize so that larger typesizes (> 8 bytes)
+  would benefit of much better compression ratios.  Speed is not
+  penalized too much.
+
+* New parametrization of the hash table for blosclz codec.  This
+  allows better compression in many scenarios, while slightly
+  increasing the speed.
+
+
+Changes from 1.5.4 to 1.6.0
+===========================
+
+* Support for AVX2 is here!  The benchmarks with a 4-core Intel
+  Haswell machine show that both compression and decompression are
+  accelerated by around 10%, reaching peaks of 9.6 GB/s during
+  compression and 26 GB/s during decompression (memcpy() speed for
+  this machine is 7.5 GB/s for writes and 11.7 GB/s for reads).  Many
+  thanks to @littlezhou for this nice work.
+
+* Support for HPET (high precision timers) for the `bench` program.
+  This is particularly important for microbenchmarks like bench is
+  doing; since they take so little time to run, the granularity of a
+  less-accurate timer may account for a significant portion of the
+  runtime of the benchmark itself, skewing the results.  Thanks to
+  Jack Pappas.
+
+
+Changes from 1.5.3 to 1.5.4
+===========================
+
+* Updated to LZ4 1.6.0 (r128).
+
+* Fix resource leak in t_blosc.  Jack Pappas.
+
+* Better checks during testing.  Jack Pappas.
+
+* Dynamically loadable HDF5 filter plugin. Kiyo Masui.
+
+
+Changes from 1.5.2 to 1.5.3
+===========================
+
+* Use llabs function (where available) instead of abs to avoid
+  truncating the result.  Jack Pappas.
+
+* Use C11 aligned_alloc when it's available.  Jack Pappas.
+
+* Use the built-in stdint.h with MSVC when available.  Jack Pappas.
+
+* Only define the __SSE2__ symbol when compiling with MS Visual C++
+  and targeting x64 or x86 with the correct /arch flag set. This
+  avoids re-defining the symbol which makes other compilers issue
+  warnings.  Jack Pappas.
+
+* Reinitializing Blosc during a call to set_nthreads() so as to fix
+  problems with contexts.  Francesc Alted.
+
+
+
+Changes from 1.5.1 to 1.5.2
+===========================
+
+* Using blosc_compress_ctx() / blosc_decompress_ctx() inside the HDF5
+  compressor for allowing operation in multiprocess scenarios.  See:
+  https://github.com/PyTables/PyTables/issues/412
+
+  The drawback of this quick fix is that the Blosc filter will be only
+  able to use a single thread until another solution can be devised.
+
+
+Changes from 1.5.0 to 1.5.1
+===========================
+
+* Updated to LZ4 1.5.0.  Closes #74.
+
+* Added the 'const' qualifier to non SSE2 shuffle functions. Closes #75.
+
+* Explicitly call blosc_init() in HDF5 blosc_filter.c, fixing a
+  segfault.
+
+* Quite a few improvements in cmake files for HDF5 support.  Thanks to
+  Dana Robinson (The HDF Group).
+
+* Variable 'class' caused problems compiling the HDF5 filter with g++.
+  Thanks to Laurent Chapon.
+
+* Small improvements on docstrings of c-blosc main functions.
+
+
+Changes from 1.4.1 to 1.5.0
+===========================
+
+* Added new calls for allowing Blosc to be used *simultaneously*
+  (i.e. lock free) from multi-threaded environments.  The new
+  functions are:
+
+  - blosc_compress_ctx(...)
+  - blosc_decompress_ctx(...)
+
+  See the new docstrings in blosc.h for how to use them.  The previous
+  API should be completely unaffected.  Thanks to Christopher Speller.
+
+* Optimized copies during BloscLZ decompression.  This can make BloscLZ
+  decompress up to 1.5x faster in some situations.
+
+* LZ4 and LZ4HC compressors updated to version 1.3.1.
+
+* Added an examples directory on how to link apps with Blosc.
+
+* stdlib.h moved from blosc.c to blosc.h as suggested by Rob Lathm.
+
+* Fix a warning for {snappy,lz4}-free compilation.  Thanks to Andrew Schaaf.
+
+* Several improvements for CMakeLists.txt (cmake).
+
+* Fixing C99 compatibility warnings.  Thanks to Christopher Speller.
+
+
+Changes from 1.4.0 to 1.4.1
+===========================
+
+* Fixed a bug in blosc_getitem() introduced in 1.4.0.  Added a test for
+  blosc_getitem() as well.
+
+
+Changes from 1.3.6 to 1.4.0
+===========================
+
+* Support for non-Intel and non-SSE2 architectures has been added.  In
+  particular, the Raspberry Pi platform (ARM) has been tested and all
+  tests pass here.
+
+* Architectures requiring strict access alignment are supported as well.
+  Due to this, architectures with a high penalty in accessing unaligned
+  data (e.g. Raspberry Pi, ARMv6) can compress up to 2.5x faster.
+
+* LZ4 has been updated to r119 (1.2.0) so as to fix a possible security
+  breach.
+
+
+Changes from 1.3.5 to 1.3.6
+===========================
+
+* Updated to LZ4 r118 due to a (highly unlikely) security hole.  For
+  details see:
+
+  http://fastcompression.blogspot.fr/2014/06/debunking-lz4-20-years-old-bug-myth.html
+
+
+Changes from 1.3.4 to 1.3.5
+===========================
+
+* Removed a 'pointer from integer without a cast' compiler warning
+  due to a bad macro definition.
+
+
+Changes from 1.3.3 to 1.3.4
+===========================
+
+* Fixed a false buffer overrun condition.  This bug made c-blosc
+  fail, even if the failure was not real.
+
+* Fixed the type of a buffer string.
+
+
+Changes from 1.3.2 to 1.3.3
+===========================
+
+* Updated to LZ4 1.1.3 (improved speed for 32-bit platforms).
+
+* Added a new `blosc_cbuffer_complib()` for getting the compression
+  library for a compressed buffer.
+
+
+Changes from 1.3.1 to 1.3.2
+===========================
+
+* Fix for compiling Snappy sources against MSVC 2008.  Thanks to Mark
+  Wiebe!
+
+* Version for internal LZ4 and Snappy are now supported.  When compiled
+  against the external libraries, this info is not available because
+  they do not support the symbols (yet).
+
+
+Changes from 1.3.0 to 1.3.1
+===========================
+
+* Fixes for a series of issues with the filter for HDF5 and, in
+  particular, a problem in the decompression buffer size that made it
+  impossible to use the blosc_filter in combination with other ones
+  (e.g. fletcher32).  See
+  https://github.com/PyTables/PyTables/issues/21.
+
+  Thanks to Antonio Valentino for the fix!
+
+
+Changes from 1.2.4 to 1.3.0
+===========================
+
+A nice handful of compressors have been added to Blosc:
+
+* LZ4 (http://code.google.com/p/lz4/): A very fast
+  compressor/decompressor.  Could be thought of as a replacement of the
+  original BloscLZ, but it can behave better in some scenarios.
+
+* LZ4HC (http://code.google.com/p/lz4/): This is a variation of LZ4
+  that achieves much better compression ratio at the cost of being
+  much slower for compressing.  Decompression speed is unaffected (and
+  sometimes better than when using LZ4 itself!), so this is very good
+  for read-only datasets.
+
+* Snappy (http://code.google.com/p/snappy/): A very fast
+  compressor/decompressor.  Could be thought of as a replacement of the
+  original BloscLZ, but it can behave better in some scenarios.
+
+* Zlib (http://www.zlib.net/): This is a classic.  It achieves very
+  good compression ratios, at the cost of speed.  However,
+  decompression speed is still pretty good, so it is a good candidate
+  for read-only datasets.
+
+With this, you can select the compression library with the new
+function::
+
+  int blosc_set_complib(char* complib);
+
+where you pass the library that you want to use (currently "blosclz",
+"lz4", "lz4hc", "snappy" and "zlib", but the list can grow in the
+future).
+
+You can get more info about compressors support in your Blosc build by
+using these functions::
+
+  char* blosc_list_compressors(void);
+  int blosc_get_complib_info(char *compressor, char **complib, char **version);
+
+
 Changes from 1.2.2 to 1.2.3
 ===========================
 
@@ -244,12 +711,3 @@ Changes from 0.8.0 to 0.9
   MacOSX (for example, Tiger).  At nay rate, posix_memalign() is not
   necessary on Mac because 16 bytes alignment is ensured by default.
   Thanks to Ivan Vilata.  Fixes #3.
-
-
-
-
-.. Local Variables:
-.. mode: rst
-.. coding: utf-8
-.. fill-column: 72
-.. End:
diff --git a/thirdparty/blosc/RELEASING.rst b/thirdparty/blosc/RELEASING.rst
index dd00dd63c29946f1742a8b5122d6bfd0232ba90a..4d6633b8ce098a7261fa5a0fc9488adb3f44dd8f 100644
--- a/thirdparty/blosc/RELEASING.rst
+++ b/thirdparty/blosc/RELEASING.rst
@@ -3,8 +3,8 @@ Releasing Blosc
 ================
 
 :Author: Francesc Alted
-:Contact: faltet@gmail.com
-:Date: 2012-09-16
+:Contact: francesc@blosc.org
+:Date: 2014-01-15
 
 
 Preliminaries
@@ -15,46 +15,27 @@ Preliminaries
 
 - Check that *VERSION* symbols in blosc/blosc.h contains the correct info.
 
+- Commit the changes::
+
+    $ git commit -a -m"Getting ready for X.Y.Z release"
+
+
 Testing
 -------
 
-Go to the test/ directory and issue::
+Create a new build/ directory, change into it and issue::
 
-  $ make test
+  $ cmake ..
+  $ cmake --build .
+  $ ctest
 
-These tests are very basic, and only valid for platforms where GNU
-make/gcc tools are available.  To actually test Blosc the hard way,
-look at the end of:
+To actually test Blosc the hard way, look at the end of:
 
-http://blosc.org/trac/wiki/SyntheticBenchmarks
+http://blosc.org/synthetic-benchmarks.html
 
 where instructions on how to intensively test (and benchmark) Blosc
 are given.
 
-Packaging
----------
-
-- Unpack the archive of the repository in a temporary directory::
-
-  $ export VERSION="the version number"
-  $ mkdir /tmp/blosc-$VERSION
-  # IMPORTANT: make sure that you are at the root of the repo now!
-  $ git archive master | tar -x -C /tmp/blosc-$VERSION
-
-- And package the repo::
-
-  $ cd /tmp
-  $ tar cvfz blosc-$VERSION.tar.gz blosc-$VERSION
-
-Do a quick check that the tarball is sane.
-
-
-Uploading
----------
-
-- Go to the downloads section in blosc.org and upload the source
-  tarball.
-
 
 Tagging
 -------
@@ -71,14 +52,11 @@ Tagging
 Announcing
 ----------
 
-- Update the release notes in the github wiki:
-
-https://github.com/FrancescAlted/blosc/wiki/Release-notes
-
-- Send an announcement to the blosc, pytables, carray and
+- Send an announcement to the blosc, pytables-dev, bcolz and
   comp.compression lists.  Use the ``ANNOUNCE.rst`` file as skeleton
   (possibly as the definitive version).
 
+
 Post-release actions
 --------------------
 
@@ -86,11 +64,15 @@ Post-release actions
   version to the next minor one (i.e. X.Y.Z --> X.Y.(Z+1).dev).
 
 - Create new headers for adding new features in ``RELEASE_NOTES.rst``
-  and empty the release-specific information in ``ANNOUNCE.rst`` and
-  add this place-holder instead:
+  and add this place-holder:
 
   #XXX version-specific blurb XXX#
 
+- Commit the changes::
+
+    $ git commit -a -m"Post X.Y.Z release actions done"
+    $ git push
+
 
 That's all folks!
 
diff --git a/thirdparty/blosc/THANKS.rst b/thirdparty/blosc/THANKS.rst
new file mode 100644
index 0000000000000000000000000000000000000000..548862a92b6786f599c7f425f9aff0689596b5c4
--- /dev/null
+++ b/thirdparty/blosc/THANKS.rst
@@ -0,0 +1,35 @@
+I'd like to thank the PyTables community that have collaborated in the
+exhaustive testing of Blosc.  With an aggregate amount of more than
+300 TB of different datasets compressed *and* decompressed
+successfully, I can say that Blosc is pretty safe now and ready for
+production purposes.
+
+Other important contributions:
+
+* Valentin Haenel did terrific work implementing the support for the
+  Snappy compression, fixing typos and improving docs and the plotting
+  script.
+
+* Thibault North, with ideas from Oscar Villellas, contributed a way
+  to call Blosc from different threads in a safe way.  Christopher
+  Speller introduced contexts so that a global lock is not necessary
+  anymore.
+
+* The CMake support was initially contributed by Thibault North, and
+  Antonio Valentino and Mark Wiebe made great enhancements to it.
+
+* Christopher Speller also introduced the two new '_ctx' calls to
+  avoid the use of the blosc_init() and blosc_destroy().
+
+* Jack Pappas contributed important portability enhancements,
+  especially runtime and cross-platform detection of SSE2/AVX2 as well
+  as high precision timers (HPET) for the benchmark program.
+
+* @littlezhou implemented the AVX2 version of shuffle routines.
+
+* Julian Taylor contributed a way to detect AVX2 at runtime and
+  call the appropriate routines only if the underlying hardware
+  supports it.
+
+* Kiyo Masui for relicensing his bitshuffle project for allowing the
+  inclusion of part of his code in Blosc.
diff --git a/thirdparty/blosc/bitshuffle-generic.c b/thirdparty/blosc/bitshuffle-generic.c
new file mode 100644
index 0000000000000000000000000000000000000000..589803f60b6ee538b7a359511098b8269fbecdfa
--- /dev/null
+++ b/thirdparty/blosc/bitshuffle-generic.c
@@ -0,0 +1,197 @@
+/*********************************************************************
+  Blosc - Blocked Shuffling and Compression Library
+
+  Author: Francesc Alted <francesc@blosc.org>
+
+  See LICENSES/BLOSC.txt for details about copyright and rights to use.
+**********************************************************************/
+
+#include "bitshuffle-generic.h"
+
+
+/* Transpose bytes within elements, starting partway through input (at element index `start`). */
+int64_t bshuf_trans_byte_elem_remainder(void* in, void* out, const size_t size,
+         const size_t elem_size, const size_t start) {
+
+    char* in_b = (char*) in;
+    char* out_b = (char*) out;
+    size_t ii, jj, kk;
+
+    CHECK_MULT_EIGHT(start);  /* returns -80 unless start is a multiple of 8 */
+
+    if (size > start) {  /* nothing is copied when start is at or past size */
+        /*  ii loop separated into 2 loops so the compiler can unroll */
+        /*  the inner one. */
+        for (ii = start; ii + 7 < size; ii += 8) {  /* complete groups of 8 elements */
+            for (jj = 0; jj < elem_size; jj++) {
+                for (kk = 0; kk < 8; kk++) {
+                    out_b[jj * size + ii + kk]
+                        = in_b[ii * elem_size + kk * elem_size + jj];  /* byte jj of element ii + kk */
+                }
+            }
+        }
+        for (ii = size - size % 8; ii < size; ii ++) {  /* trailing elements when size is not a multiple of 8 */
+            for (jj = 0; jj < elem_size; jj++) {
+                out_b[jj * size + ii] = in_b[ii * elem_size + jj];
+            }
+        }
+    }
+    return size * elem_size;  /* always size * elem_size, independent of start */
+}
+
+
+/* Transpose bytes within elements for the whole input (remainder variant called with start = 0). */
+int64_t bshuf_trans_byte_elem_scal(void* in, void* out, const size_t size,
+				   const size_t elem_size) {
+
+    return bshuf_trans_byte_elem_remainder(in, out, size, elem_size, 0);
+}
+
+
+/* Transpose bits within bytes, one 64-bit word at a time, starting at byte offset start_byte. */
+int64_t bshuf_trans_bit_byte_remainder(void* in, void* out, const size_t size,
+         const size_t elem_size, const size_t start_byte) {
+
+    int64_t* in_b = in;  /* input is read as 64-bit words */
+    int8_t* out_b = out;
+
+    int64_t x, t;
+
+    size_t nbyte = elem_size * size;
+    size_t nbyte_bitrow = nbyte / 8;  /* word count; also the stride between the 8 output rows */
+    size_t ii;
+    int kk;
+
+    CHECK_MULT_EIGHT(nbyte);  /* returns -80 unless nbyte is a multiple of 8 */
+    CHECK_MULT_EIGHT(start_byte);  /* returns -80 unless start_byte is a multiple of 8 */
+
+    for (ii = start_byte / 8; ii < nbyte_bitrow; ii ++) {
+        x = in_b[ii];
+        TRANS_BIT_8X8(x, t);  /* 8x8 bit transpose of the word held in x */
+        for (kk = 0; kk < 8; kk ++) {
+            out_b[kk * nbyte_bitrow + ii] = x;  /* narrowing store: keeps the low 8 bits of x */
+            x = x >> 8;
+        }
+    }
+    return size * elem_size;
+}
+
+
+/* Transpose bits within bytes for the whole input (remainder variant called with start_byte = 0). */
+int64_t bshuf_trans_bit_byte_scal(void* in, void* out, const size_t size,
+         const size_t elem_size) {
+
+    return bshuf_trans_bit_byte_remainder(in, out, size, elem_size, 0);
+}
+
+
+/* General transpose of an lda-by-ldb array of elements, optimized for large element sizes. */
+int64_t bshuf_trans_elem(void* in, void* out, const size_t lda,
+        const size_t ldb, const size_t elem_size) {
+
+    char* in_b = (char*) in;
+    char* out_b = (char*) out;
+    size_t ii, jj;
+    for (ii = 0; ii < lda; ii++) {
+        for (jj = 0; jj < ldb; jj++) {
+            memcpy(&out_b[(jj*lda + ii) * elem_size],
+                   &in_b[(ii*ldb + jj) * elem_size], elem_size);  /* out[jj][ii] = in[ii][jj] */
+        }
+    }
+    return lda * ldb * elem_size;  /* total number of bytes copied */
+}
+
+
+/* Transpose rows of shuffled bits (size / 8 bytes) within groups of 8. */
+int64_t bshuf_trans_bitrow_eight(void* in, void* out, const size_t size,
+         const size_t elem_size) {
+
+    size_t nbyte_bitrow = size / 8;  /* bytes per bit row */
+
+    CHECK_MULT_EIGHT(size);  /* returns -80 unless size is a multiple of 8 */
+
+    return bshuf_trans_elem(in, out, 8, elem_size, nbyte_bitrow);  /* 8 x elem_size array of nbyte_bitrow-byte items */
+}
+
+
+/* Transpose bits within elements: byte transpose, bit transpose, then bit-row regrouping. */
+int64_t bshuf_trans_bit_elem_scal(void* in, void* out, const size_t size,
+                                  const size_t elem_size, void* tmp_buf) {  /* tmp_buf: scratch, written with size * elem_size bytes */
+
+    int64_t count;
+
+    CHECK_MULT_EIGHT(size);  /* returns -80 unless size is a multiple of 8 */
+
+    count = bshuf_trans_byte_elem_scal(in, out, size, elem_size);  /* in -> out */
+    CHECK_ERR(count);  /* propagate a negative return value */
+    count = bshuf_trans_bit_byte_scal(out, tmp_buf, size, elem_size);  /* out -> tmp_buf */
+    CHECK_ERR(count);  /* propagate a negative return value */
+    count = bshuf_trans_bitrow_eight(tmp_buf, out, size, elem_size);  /* tmp_buf -> out */
+
+    return count;
+}
+
+
+/* For data organized into a row for each bit (8 * elem_size rows of
+ * size / 8 bytes each), transpose the bytes. */
+int64_t bshuf_trans_byte_bitrow_scal(void* in, void* out, const size_t size,
+         const size_t elem_size) {
+    char* in_b = (char*) in;
+    char* out_b = (char*) out;
+
+    size_t nbyte_row = size / 8;  /* bytes per input row */
+    size_t ii, jj, kk;
+
+    CHECK_MULT_EIGHT(size);  /* returns -80 unless size is a multiple of 8 */
+
+    for (jj = 0; jj < elem_size; jj++) {
+        for (ii = 0; ii < nbyte_row; ii++) {
+            for (kk = 0; kk < 8; kk++) {  /* gather byte ii from rows jj*8 .. jj*8 + 7 into 8 adjacent output bytes */
+                out_b[ii * 8 * elem_size + jj * 8 + kk] = \
+                        in_b[(jj * 8 + kk) * nbyte_row + ii];
+            }
+        }
+    }
+    return size * elem_size;
+}
+
+
+/* Shuffle bits within the bytes of eight element blocks (8 * elem_size bytes per block). */
+int64_t bshuf_shuffle_bit_eightelem_scal(void* in, void* out,
+        const size_t size, const size_t elem_size) {
+    char* in_b = (char*) in;
+    char* out_b = (char*) out;
+    size_t nbyte = elem_size * size;
+    int64_t x, t;
+    size_t jj, ii, kk;
+
+    CHECK_MULT_EIGHT(size);  /* returns -80 unless size is a multiple of 8 */
+
+    for (jj = 0; jj < 8 * elem_size; jj += 8) {  /* jj: offset of a 64-bit word inside a block */
+        for (ii = 0; ii + 8 * elem_size - 1 < nbyte; ii += 8 * elem_size) {  /* ii: offset of each complete block */
+            x = *((int64_t*) &in_b[ii + jj]);  /* NOTE(review): 64-bit load through a cast char pointer; confirm alignment is acceptable on target platforms */
+            TRANS_BIT_8X8(x, t);  /* 8x8 bit transpose of the word held in x */
+            for (kk = 0; kk < 8; kk++) {
+                *((uint8_t*) &out_b[ii + jj / 8 + kk * elem_size]) = x;  /* low 8 bits of x -> byte jj / 8 of element kk in this block */
+                x = x >> 8;
+            }
+        }
+    }
+    return size * elem_size;
+}
+
+
+/* Untranspose bits within elements: byte/bit-row transpose, then per-block bit shuffle. */
+int64_t bshuf_untrans_bit_elem_scal(void* in, void* out, const size_t size,
+                                    const size_t elem_size, void* tmp_buf) {  /* tmp_buf: scratch for the intermediate result */
+
+    int64_t count;
+
+    CHECK_MULT_EIGHT(size);  /* returns -80 unless size is a multiple of 8 */
+
+    count = bshuf_trans_byte_bitrow_scal(in, tmp_buf, size, elem_size);  /* in -> tmp_buf */
+    CHECK_ERR(count);  /* propagate a negative return value */
+    count =  bshuf_shuffle_bit_eightelem_scal(tmp_buf, out, size, elem_size);  /* tmp_buf -> out */
+
+    return count;
+}
diff --git a/thirdparty/blosc/bitshuffle-generic.h b/thirdparty/blosc/bitshuffle-generic.h
new file mode 100644
index 0000000000000000000000000000000000000000..03b3f55473c533837ae327c825ba3ea82138b213
--- /dev/null
+++ b/thirdparty/blosc/bitshuffle-generic.h
@@ -0,0 +1,151 @@
+/*********************************************************************
+  Blosc - Blocked Shuffling and Compression Library
+
+  Author: Francesc Alted <francesc@blosc.org>
+
+  See LICENSES/BLOSC.txt for details about copyright and rights to use.
+**********************************************************************/
+
+/* Generic (non-hardware-accelerated) shuffle/unshuffle routines.
+   These are used when hardware-accelerated functions aren't available
+   for a particular platform; they are also used by the hardware-
+   accelerated functions to handle any remaining elements in a block
+   which isn't a multiple of the hardware's vector size. */
+
+#ifndef BITSHUFFLE_GENERIC_H
+#define BITSHUFFLE_GENERIC_H
+
+#include "shuffle-common.h"
+#include <stdlib.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/* Helper macros; arguments are parenthesized so expressions expand safely. */
+#define CHECK_MULT_EIGHT(n) if ((n) % 8) return -80;
+#define MIN(X,Y) ((X) < (Y) ? (X) : (Y))
+#define MAX(X,Y) ((X) > (Y) ? (X) : (Y))
+#define CHECK_ERR(count) if ((count) < 0) { return (count); }
+
+
+/* ---- Worker code not requiring special instruction sets. ----
+ *
+ * The following code does not use any x86 specific vectorized instructions
+ * and should compile on any machine
+ *
+ */
+
+/* Transpose 8x8 bit array packed into a single quadword *x*.
+ * *x* is updated in place; *t* is workspace (it is overwritten). */
+#define TRANS_BIT_8X8(x, t) {                                               \
+        t = (x ^ (x >> 7)) & 0x00AA00AA00AA00AALL;                          \
+        x = x ^ t ^ (t << 7);                                               \
+        t = (x ^ (x >> 14)) & 0x0000CCCC0000CCCCLL;                         \
+        x = x ^ t ^ (t << 14);                                              \
+        t = (x ^ (x >> 28)) & 0x00000000F0F0F0F0LL;                         \
+        x = x ^ t ^ (t << 28);                                              \
+    }
+
+
+/* Transpose an lda-by-ldb array of type_t elements from `in` into `out`. */
+#define TRANS_ELEM_TYPE(in, out, lda, ldb, type_t) {                        \
+        type_t* in_type = (type_t*) in;                                     \
+        type_t* out_type = (type_t*) out;                                   \
+        size_t ii, jj, kk;                                                  \
+        for (ii = 0; ii + 7 < lda; ii += 8) {                               \
+            for (jj = 0; jj < ldb; jj++) {                                  \
+                for (kk = 0; kk < 8; kk++) {                                \
+                    out_type[jj*lda + ii + kk] =                            \
+                        in_type[ii*ldb + kk * ldb + jj];                    \
+                }                                                           \
+            }                                                               \
+        }                                                                   \
+        for (ii = lda - lda % 8; ii < lda; ii ++) {                         \
+            for (jj = 0; jj < ldb; jj++) {                                  \
+                out_type[jj*lda + ii] = in_type[ii*ldb + jj];               \
+            }                                                               \
+        }                                                                   \
+    }
+
+
+/* Private functions */
+BLOSC_NO_EXPORT int64_t
+bshuf_trans_byte_elem_remainder(void* in, void* out, const size_t size,
+                                const size_t elem_size, const size_t start);
+
+BLOSC_NO_EXPORT int64_t
+bshuf_trans_byte_elem_scal(void* in, void* out, const size_t size,
+                           const size_t elem_size);
+
+BLOSC_NO_EXPORT int64_t
+bshuf_trans_bit_byte_remainder(void* in, void* out, const size_t size,
+                               const size_t elem_size, const size_t start_byte);
+
+BLOSC_NO_EXPORT int64_t
+bshuf_trans_elem(void* in, void* out, const size_t lda,
+                 const size_t ldb, const size_t elem_size);
+
+BLOSC_NO_EXPORT int64_t
+bshuf_trans_bitrow_eight(void* in, void* out, const size_t size,
+                         const size_t elem_size);
+
+BLOSC_NO_EXPORT int64_t
+bshuf_shuffle_bit_eightelem_scal(void* in, void* out,
+                                 const size_t size, const size_t elem_size);
+
+
+/* Bitshuffle the data.
+ *
+ * Transpose the bits within elements.
+ *
+ * Parameters
+ * ----------
+ *  in : input buffer, must be of size * elem_size bytes
+ *  out : output buffer, must be of size * elem_size bytes
+ *  size : number of elements in input
+ *  elem_size : element size of typed data
+ *  tmp_buf : temporary buffer with the same size as `in` and `out`
+ *
+ * Returns
+ * -------
+ *  an int64_t status; presumably negative on error (to be confirmed)
+ *
+ */
+
+BLOSC_NO_EXPORT int64_t
+bshuf_trans_bit_elem_scal(void* in, void* out, const size_t size,
+                          const size_t elem_size, void* tmp_buf);
+
+/* Unshuffle bitshuffled data.
+ *
+ * Untranspose the bits within elements.
+ *
+ * To properly unshuffle bitshuffled data, *size* and *elem_size* must
+ * match the parameters used to shuffle the data.
+ *
+ * Parameters
+ * ----------
+ *  in : input buffer, must be of size * elem_size bytes
+ *  out : output buffer, must be of size * elem_size bytes
+ *  size : number of elements in input
+ *  elem_size : element size of typed data
+ *  tmp_buf : temporary buffer with the same size as `in` and `out`
+ *
+ * Returns
+ * -------
+ *  size * elem_size on success, negative on error (-80 if size % 8 != 0)
+ *
+ */
+
+BLOSC_NO_EXPORT int64_t
+bshuf_untrans_bit_elem_scal(void* in, void* out, const size_t size,
+                            const size_t elem_size, void* tmp_buf);
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* BITSHUFFLE_GENERIC_H */
diff --git a/thirdparty/blosc/blosc-export.h b/thirdparty/blosc/blosc-export.h
new file mode 100644
index 0000000000000000000000000000000000000000..49df9296b5276e6df3fb21e05dad50f74576cd93
--- /dev/null
+++ b/thirdparty/blosc/blosc-export.h
@@ -0,0 +1,45 @@
+/*********************************************************************
+  Blosc - Blocked Shuffling and Compression Library
+
+  Author: Francesc Alted <francesc@blosc.org>
+
+  See LICENSES/BLOSC.txt for details about copyright and rights to use.
+**********************************************************************/
+#ifndef BLOSC_EXPORT_H
+#define BLOSC_EXPORT_H
+
+/* Macros for specifying exported symbols.
+   BLOSC_EXPORT is used to decorate symbols that should be
+   exported by the blosc shared library.
+   BLOSC_NO_EXPORT is used to decorate symbols that should NOT
+   be exported by the blosc shared library.
+*/
+#if defined(BLOSC_SHARED_LIBRARY)
+  #if defined(_MSC_VER)
+    #define BLOSC_EXPORT __declspec(dllexport)
+  #elif (defined(__GNUC__) && __GNUC__ >= 4) || defined(__clang__)
+    #if defined(_WIN32) || defined(__CYGWIN__) || defined(__MINGW32__)
+      #define BLOSC_EXPORT __attribute__((dllexport))
+    #else
+      #define BLOSC_EXPORT __attribute__((visibility("default")))
+    #endif  /* defined(_WIN32) || defined(__CYGWIN__) */
+  #else
+    #error Cannot determine how to define BLOSC_EXPORT for this compiler.
+  #endif
+#else
+  #define BLOSC_EXPORT
+#endif  /* defined(BLOSC_SHARED_LIBRARY) */
+/* "hidden" visibility is an ELF feature; GCC ignores it on Windows targets. */
+#if (defined(__GNUC__) || defined(__clang__)) && !defined(_WIN32) && !defined(__CYGWIN__)
+  #define BLOSC_NO_EXPORT __attribute__((visibility("hidden")))
+#else
+  #define BLOSC_NO_EXPORT
+#endif  /* GCC-compatible compiler on a non-Windows target */
+
+/* When testing, export everything to make it easier to implement tests. */
+#if defined(BLOSC_TESTING)
+  #undef BLOSC_NO_EXPORT
+  #define BLOSC_NO_EXPORT BLOSC_EXPORT
+#endif  /* defined(BLOSC_TESTING) */
+
+#endif  /* BLOSC_EXPORT_H */
diff --git a/thirdparty/blosc/blosc.c b/thirdparty/blosc/blosc.c
index fdc82b5d2d2b439ba2c3b7e0abcaad1ea12e4e4e..c3ed70197f6349ed3fe707853ee84612d34c3360 100644
--- a/thirdparty/blosc/blosc.c
+++ b/thirdparty/blosc/blosc.c
@@ -1,26 +1,51 @@
 /*********************************************************************
-  Blosc - Blocked Suffling and Compression Library
+  Blosc - Blocked Shuffling and Compression Library
 
-  Author: Francesc Alted <faltet@gmail.com>
+  Author: Francesc Alted <francesc@blosc.org>
   Creation date: 2009-05-20
 
   See LICENSES/BLOSC.txt for details about copyright and rights to use.
 **********************************************************************/
 
 
-#include <stdlib.h>
 #include <stdio.h>
+#include <stdlib.h>
+#include <errno.h>
 #include <string.h>
 #include <sys/types.h>
 #include <sys/stat.h>
 #include <assert.h>
+#if defined(USING_CMAKE)
+  #include "config.h"
+#endif /*  USING_CMAKE */
 #include "blosc.h"
-#include "blosclz.h"
 #include "shuffle.h"
+#include "blosclz.h"
+#if defined(HAVE_LZ4)
+  #include "lz4.h"
+  #include "lz4hc.h"
+#endif /*  HAVE_LZ4 */
+#if defined(HAVE_SNAPPY)
+  #include "snappy-c.h"
+#endif /*  HAVE_SNAPPY */
+#if defined(HAVE_ZLIB)
+  #include "zlib.h"
+#endif /*  HAVE_ZLIB */
+#if defined(HAVE_ZSTD)
+  #include "zstd.h"
+#endif /*  HAVE_ZSTD */
 
 #if defined(_WIN32) && !defined(__MINGW32__)
   #include <windows.h>
-  #include "win32/stdint-windows.h"
+  #include <malloc.h>
+
+  /* stdint.h only available in VS2010 (VC++ 16.0) and newer */
+  #if defined(_MSC_VER) && _MSC_VER < 1600
+    #include "win32/stdint-windows.h"
+  #else
+    #include <stdint.h>
+  #endif
+
   #include <process.h>
   #define getpid _getpid
 #else
@@ -29,13 +54,18 @@
   #include <inttypes.h>
 #endif  /* _WIN32 */
 
-#if defined(_WIN32)
+#if defined(_WIN32) && !defined(__GNUC__)
   #include "win32/pthread.h"
   #include "win32/pthread.c"
 #else
   #include <pthread.h>
 #endif
 
+/* If C11 is supported, use its built-in aligned allocation. */
+#if __STDC_VERSION__ >= 201112L
+  #include <stdalign.h>
+#endif
+
 
 /* Some useful units */
 #define KB 1024
@@ -50,119 +80,123 @@
 /* The size of L1 cache.  32 KB is quite common nowadays. */
 #define L1 (32*KB)
 
-/* Wrapped function to adjust the number of threads used by blosc */
-int blosc_set_nthreads_(int);
-
-/* Global variables for main logic */
-static int32_t init_temps_done = 0;    /* temp for compr/decompr initialized? */
-static int32_t force_blocksize = 0;    /* force the use of a blocksize? */
-static int pid = 0;                    /* the PID for this process */
-static int init_lib = 0;               /* is library initalized? */
-
-/* Global variables for threads */
-static int32_t nthreads = 1;            /* number of desired threads in pool */
-static int32_t init_threads_done = 0;   /* pool of threads initialized? */
-static int32_t end_threads = 0;         /* should exisiting threads end? */
-static int32_t init_sentinels_done = 0; /* sentinels initialized? */
-static int32_t giveup_code;             /* error code when give up */
-static int32_t nblock;                  /* block counter */
-static pthread_t threads[BLOSC_MAX_THREADS];  /* opaque structure for threads */
-static int32_t tids[BLOSC_MAX_THREADS];       /* ID per each thread */
-#if !defined(_WIN32)
-static pthread_attr_t ct_attr;          /* creation time attrs for threads */
-#endif
-
 /* Have problems using posix barriers when symbol value is 200112L */
 /* This requires more investigation, but will work for the moment */
 #if defined(_POSIX_BARRIERS) && ( (_POSIX_BARRIERS - 20012L) >= 0 && _POSIX_BARRIERS != 200112L)
 #define _POSIX_BARRIERS_MINE
 #endif
-
 /* Synchronization variables */
-static pthread_mutex_t count_mutex;
-static pthread_mutex_t global_comp_mutex;
-#ifdef _POSIX_BARRIERS_MINE
-static pthread_barrier_t barr_init;
-static pthread_barrier_t barr_finish;
-#else
-static int32_t count_threads;
-static pthread_mutex_t count_threads_mutex;
-static pthread_cond_t count_threads_cv;
-#endif
 
 
-/* Structure for parameters in (de-)compression threads */
-static struct thread_data {
-  int32_t typesize;
-  int32_t blocksize;
-  int32_t compress;
-  int32_t clevel;
-  int32_t flags;
-  int32_t memcpyed;
-  int32_t ntbytes;
-  int32_t nbytes;
-  int32_t maxbytes;
-  int32_t nblocks;
-  int32_t leftover;
-  int32_t *bstarts;             /* start pointers for each block */
-  uint8_t *src;
-  uint8_t *dest;
-  uint8_t *tmp[BLOSC_MAX_THREADS];
-  uint8_t *tmp2[BLOSC_MAX_THREADS];
-} params;
+struct blosc_context {            /* Buffers, parameters and thread-pool state of a context */
+  int32_t compress;               /* 1 if we are doing compression 0 if decompress */
+
+  const uint8_t* src;             /* Source buffer (never written through this pointer) */
+  uint8_t* dest;                  /* The current pos in the destination buffer */
+  uint8_t* header_flags;          /* Flags for header.  Currently booked:
+                                    - 0: byte-shuffled?
+                                    - 1: memcpy'ed?
+                                    - 2: bit-shuffled? */
+  int32_t sourcesize;             /* Number of bytes in source buffer (or uncompressed bytes in compressed file) */
+  int32_t nblocks;                /* Number of total blocks in buffer */
+  int32_t leftover;               /* Extra bytes at end of buffer */
+  int32_t blocksize;              /* Length of the block in bytes */
+  int32_t typesize;               /* Type size */
+  int32_t num_output_bytes;       /* Counter for the number of output bytes */
+  int32_t destsize;               /* Maximum size for destination buffer */
+  uint8_t* bstarts;               /* Start of the buffer past header info */
+  int32_t compcode;               /* Compressor code to use */
+  int clevel;                     /* Compression level (1-9) */
+
+  /* Threading */
+  int32_t numthreads;             /* thread count WAIT_INIT compares count_threads against */
+  int32_t threads_started;        /* NOTE(review): presumably threads created so far -- confirm */
+  int32_t end_threads;            /* NOTE(review): presumably "should existing threads end?" -- confirm */
+  pthread_t threads[BLOSC_MAX_THREADS];   /* thread handles */
+  int32_t tids[BLOSC_MAX_THREADS];        /* one ID slot per thread */
+  pthread_mutex_t count_mutex;    /* NOTE(review): presumably guards the block counter -- confirm */
+  #ifdef _POSIX_BARRIERS_MINE
+  pthread_barrier_t barr_init;    /* waited on by WAIT_INIT */
+  pthread_barrier_t barr_finish;  /* waited on by WAIT_FINISH */
+  #else
+  int32_t count_threads;          /* waiter count updated by WAIT_INIT / WAIT_FINISH */
+  pthread_mutex_t count_threads_mutex;    /* held while count_threads is touched */
+  pthread_cond_t count_threads_cv;        /* broadcast once the last thread arrives */
+  #endif
+  #if !defined(_WIN32)
+  pthread_attr_t ct_attr;            /* creation time attrs for threads */
+  #endif
+  int32_t thread_giveup_code;               /* error code when give up */
+  int32_t thread_nblock;                    /* block counter */
+};
+
+struct thread_context {           /* NOTE(review): presumably one instance per worker thread */
+  struct blosc_context* parent_context;   /* the context this thread state belongs to */
+  int32_t tid;                    /* NOTE(review): presumably this thread's ID -- confirm */
+  uint8_t* tmp;                   /* temporary buffers; see tmpblocksize for their size */
+  uint8_t* tmp2;
+  uint8_t* tmp3;
+  int32_t tmpblocksize; /* Used to keep track of how big the temporary buffers are */
+};
+
+/* Global context for non-contextual API */
+static struct blosc_context* g_global_context;
+static pthread_mutex_t global_comp_mutex;
+static int32_t g_compressor = BLOSC_BLOSCLZ;  /* the compressor to use by default */
+static int32_t g_threads = 1;
+static int32_t g_force_blocksize = 0;
+static int32_t g_initlib = 0;
 
 
-/* Structure for parameters meant for keeping track of current temporaries */
-static struct temp_data {
-  int32_t nthreads;
-  int32_t typesize;
-  int32_t blocksize;
-} current_temp;
 
+/* Wrapped function to adjust the number of threads used by blosc */
+int blosc_set_nthreads_(struct blosc_context*);
+
+/* Releases the global threadpool */
+int blosc_release_threadpool(struct blosc_context* context);
 
 /* Macros for synchronization */
 
 /* Wait until all threads are initialized */
 #ifdef _POSIX_BARRIERS_MINE
-static int rc;
-#define WAIT_INIT \
-  rc = pthread_barrier_wait(&barr_init); \
+#define WAIT_INIT(RET_VAL, CONTEXT_PTR)  \
+  rc = pthread_barrier_wait(&CONTEXT_PTR->barr_init); \
   if (rc != 0 && rc != PTHREAD_BARRIER_SERIAL_THREAD) { \
-    printf("Could not wait on barrier (init)\n"); \
-    return(-1); \
+    printf("Could not wait on barrier (init): %d\n", rc); \
+    return((RET_VAL));                            \
   }
 #else
-#define WAIT_INIT \
-  pthread_mutex_lock(&count_threads_mutex); \
-  if (count_threads < nthreads) { \
-    count_threads++; \
-    pthread_cond_wait(&count_threads_cv, &count_threads_mutex); \
+#define WAIT_INIT(RET_VAL, CONTEXT_PTR)   \
+  pthread_mutex_lock(&CONTEXT_PTR->count_threads_mutex); \
+  if (CONTEXT_PTR->count_threads < CONTEXT_PTR->numthreads) { \
+    CONTEXT_PTR->count_threads++;  \
+    pthread_cond_wait(&CONTEXT_PTR->count_threads_cv, &CONTEXT_PTR->count_threads_mutex); \
   } \
   else { \
-    pthread_cond_broadcast(&count_threads_cv); \
+    pthread_cond_broadcast(&CONTEXT_PTR->count_threads_cv); \
   } \
-  pthread_mutex_unlock(&count_threads_mutex);
+  pthread_mutex_unlock(&CONTEXT_PTR->count_threads_mutex);
 #endif
 
 /* Wait for all threads to finish */
 #ifdef _POSIX_BARRIERS_MINE
-#define WAIT_FINISH \
-  rc = pthread_barrier_wait(&barr_finish); \
+#define WAIT_FINISH(RET_VAL, CONTEXT_PTR)   \
+  rc = pthread_barrier_wait(&CONTEXT_PTR->barr_finish); \
   if (rc != 0 && rc != PTHREAD_BARRIER_SERIAL_THREAD) { \
     printf("Could not wait on barrier (finish)\n"); \
-    return(-1);                                       \
+    return((RET_VAL));                              \
   }
 #else
-#define WAIT_FINISH \
-  pthread_mutex_lock(&count_threads_mutex); \
-  if (count_threads > 0) { \
-    count_threads--; \
-    pthread_cond_wait(&count_threads_cv, &count_threads_mutex); \
+#define WAIT_FINISH(RET_VAL, CONTEXT_PTR)                           \
+  pthread_mutex_lock(&CONTEXT_PTR->count_threads_mutex); \
+  if (CONTEXT_PTR->count_threads > 0) { \
+    CONTEXT_PTR->count_threads--; \
+    pthread_cond_wait(&CONTEXT_PTR->count_threads_cv, &CONTEXT_PTR->count_threads_mutex); \
   } \
   else { \
-    pthread_cond_broadcast(&count_threads_cv); \
+    pthread_cond_broadcast(&CONTEXT_PTR->count_threads_cv); \
   } \
-  pthread_mutex_unlock(&count_threads_mutex);
+  pthread_mutex_unlock(&CONTEXT_PTR->count_threads_mutex);
 #endif
 
 
@@ -172,15 +206,19 @@ static uint8_t *my_malloc(size_t size)
   void *block = NULL;
   int res = 0;
 
-#if defined(_WIN32)
+/* Do an alignment to 32 bytes because AVX2 is supported */
+#if _ISOC11_SOURCE
+  /* C11 aligned allocation. 'size' must be a multiple of the alignment. */
+  block = aligned_alloc(32, size);
+#elif defined(_WIN32)
   /* A (void *) cast needed for avoiding a warning with MINGW :-/ */
-  block = (void *)_aligned_malloc(size, 16);
+  block = (void *)_aligned_malloc(size, 32);
 #elif defined __APPLE__
   /* Mac OS X guarantees 16-byte alignment in small allocs */
   block = malloc(size);
 #elif _POSIX_C_SOURCE >= 200112L || _XOPEN_SOURCE >= 600
   /* Platform does have an implementation of posix_memalign */
-  res = posix_memalign(&block, 16, size);
+  res = posix_memalign(&block, 32, size);
 #else
   block = malloc(size);
 #endif  /* _WIN32 */
@@ -205,52 +243,353 @@ static void my_free(void *block)
 }
 
 
-/* If `a` is little-endian, return it as-is.  If not, return a copy,
-   with the endianness changed */
-static int32_t sw32(int32_t a)
+/* Decode the 4 bytes at `pa` as a little-endian 32-bit integer.
+ *
+ * The host byte order is probed at run time: little-endian hosts copy the
+ * bytes straight through, big-endian hosts reverse them, so the returned
+ * value is the same on both. */
+static int32_t sw32_(const uint8_t *pa)
+{
+  int32_t value;
+  uint8_t *out = (uint8_t *)&value;
+  const int probe = 1;          /* for big/little endian detection */
+  const int host_is_le = (*(const char *)&probe == 1);
+  int k;
+
+  for (k = 0; k < 4; k++) {
+    if (host_is_le) {
+      out[k] = pa[k];           /* same order as stored */
+    }
+    else {
+      out[k] = pa[3 - k];       /* reversed for big endian */
+    }
+  }
+
+  return value;
+}
+
+
+/* Store the 4 bytes of `a` at `*dest`, changing endianness if necessary. */
+static void _sw32(uint8_t* dest, int32_t a)
 {
-  int32_t tmp;
-  char *pa = (char *)&a;
-  char *ptmp = (char *)&tmp;
+  uint8_t *pa = (uint8_t *)&a;
   int i = 1;                    /* for big/little endian detection */
   char *p = (char *)&i;
 
   if (p[0] != 1) {
     /* big endian */
-    ptmp[0] = pa[3];
-    ptmp[1] = pa[2];
-    ptmp[2] = pa[1];
-    ptmp[3] = pa[0];
-    return tmp;
+    dest[0] = pa[3];
+    dest[1] = pa[2];
+    dest[2] = pa[1];
+    dest[3] = pa[0];
   }
   else {
     /* little endian */
-    return a;
+    dest[0] = pa[0];
+    dest[1] = pa[1];
+    dest[2] = pa[2];
+    dest[3] = pa[3];
   }
 }
 
 
+/*
+ * Conversion routines between compressor and compression libraries
+ */
+
+/* Return the library code associated with the compressor name (-1 if unknown) */
+static int compname_to_clibcode(const char *compname)
+{
+  int clibcode = -1;            /* stays -1 for an unrecognized name */
+  if (!strcmp(compname, BLOSC_BLOSCLZ_COMPNAME))
+    clibcode = BLOSC_BLOSCLZ_LIB;
+  else if (!strcmp(compname, BLOSC_LZ4_COMPNAME) ||
+           !strcmp(compname, BLOSC_LZ4HC_COMPNAME))
+    clibcode = BLOSC_LZ4_LIB;   /* lz4 and lz4hc map to the same library */
+  else if (!strcmp(compname, BLOSC_SNAPPY_COMPNAME))
+    clibcode = BLOSC_SNAPPY_LIB;
+  else if (!strcmp(compname, BLOSC_ZLIB_COMPNAME))
+    clibcode = BLOSC_ZLIB_LIB;
+  else if (!strcmp(compname, BLOSC_ZSTD_COMPNAME))
+    clibcode = BLOSC_ZSTD_LIB;
+  return clibcode;
+}
+
+/* Return the library name associated with the compressor code */
+static char *clibcode_to_clibname(int clibcode)
+{
+  return (clibcode == BLOSC_BLOSCLZ_LIB) ? BLOSC_BLOSCLZ_LIBNAME :
+         (clibcode == BLOSC_LZ4_LIB)     ? BLOSC_LZ4_LIBNAME :
+         (clibcode == BLOSC_SNAPPY_LIB)  ? BLOSC_SNAPPY_LIBNAME :
+         (clibcode == BLOSC_ZLIB_LIB)    ? BLOSC_ZLIB_LIBNAME :
+         (clibcode == BLOSC_ZSTD_LIB)    ? BLOSC_ZSTD_LIBNAME :
+         NULL;                  /* should never happen */
+}
+
+
+/*
+ * Conversion routines between compressor names and compressor codes
+ */
+
+/* Set *compname to the name for compcode; return compcode if supported, else -1 */
+int blosc_compcode_to_compname(int compcode, char **compname)
+{
+  int code = -1;    /* -1 means non-existent compressor code */
+  char *name = NULL;
+
+  /* Map the compressor code to its name (stays NULL for an unknown code) */
+  if (compcode == BLOSC_BLOSCLZ)
+    name = BLOSC_BLOSCLZ_COMPNAME;
+  else if (compcode == BLOSC_LZ4)
+    name = BLOSC_LZ4_COMPNAME;
+  else if (compcode == BLOSC_LZ4HC)
+    name = BLOSC_LZ4HC_COMPNAME;
+  else if (compcode == BLOSC_SNAPPY)
+    name = BLOSC_SNAPPY_COMPNAME;
+  else if (compcode == BLOSC_ZLIB)
+    name = BLOSC_ZLIB_COMPNAME;
+  else if (compcode == BLOSC_ZSTD)
+    name = BLOSC_ZSTD_COMPNAME;
+
+  *compname = name;
+
+  /* Report the code only when support for it was compiled into this build */
+  if (compcode == BLOSC_BLOSCLZ)
+    code = BLOSC_BLOSCLZ;
+#if defined(HAVE_LZ4)
+  else if (compcode == BLOSC_LZ4)
+    code = BLOSC_LZ4;
+  else if (compcode == BLOSC_LZ4HC)
+    code = BLOSC_LZ4HC;
+#endif /*  HAVE_LZ4 */
+#if defined(HAVE_SNAPPY)
+  else if (compcode == BLOSC_SNAPPY)
+    code = BLOSC_SNAPPY;
+#endif /*  HAVE_SNAPPY */
+#if defined(HAVE_ZLIB)
+  else if (compcode == BLOSC_ZLIB)
+    code = BLOSC_ZLIB;
+#endif /*  HAVE_ZLIB */
+#if defined(HAVE_ZSTD)
+  else if (compcode == BLOSC_ZSTD)
+    code = BLOSC_ZSTD;
+#endif /*  HAVE_ZSTD */
+
+  return code;
+}
+
+/* Get the compressor code for the compressor name. -1 if it is not available */
+int blosc_compname_to_compcode(const char *compname)
+{
+  int code = -1;  /* -1 means non-existent compressor code */
+  /* Names of compressors left out of this build also fall through to -1. */
+  if (strcmp(compname, BLOSC_BLOSCLZ_COMPNAME) == 0) {
+    code = BLOSC_BLOSCLZ;
+  }
+#if defined(HAVE_LZ4)
+  else if (strcmp(compname, BLOSC_LZ4_COMPNAME) == 0) {
+    code = BLOSC_LZ4;
+  }
+  else if (strcmp(compname, BLOSC_LZ4HC_COMPNAME) == 0) {
+    code = BLOSC_LZ4HC;
+  }
+#endif /*  HAVE_LZ4 */
+#if defined(HAVE_SNAPPY)
+  else if (strcmp(compname, BLOSC_SNAPPY_COMPNAME) == 0) {
+    code = BLOSC_SNAPPY;
+  }
+#endif /*  HAVE_SNAPPY */
+#if defined(HAVE_ZLIB)
+  else if (strcmp(compname, BLOSC_ZLIB_COMPNAME) == 0) {
+    code = BLOSC_ZLIB;
+  }
+#endif /*  HAVE_ZLIB */
+#if defined(HAVE_ZSTD)
+  else if (strcmp(compname, BLOSC_ZSTD_COMPNAME) == 0) {
+    code = BLOSC_ZSTD;
+  }
+#endif /*  HAVE_ZSTD */
+
+return code;
+}
+
+
+#if defined(HAVE_LZ4)
+/* Thin adapter over LZ4_compress_fast(): narrows the size_t lengths to the
+ * int arguments it takes and hands its return value back unchanged. */
+static int lz4_wrap_compress(const char* input, size_t input_length,
+                             char* output, size_t maxout, int accel)
+{
+  return LZ4_compress_fast(input, output,
+                           (int)input_length, (int)maxout, accel);
+}
+
+/* Compress with LZ4 HC.  Returns the LZ4 result, or -1 when the input is
+ * too long to be described by the int length that LZ4 takes. */
+static int lz4hc_wrap_compress(const char* input, size_t input_length,
+                               char* output, size_t maxout, int clevel)
+{
+  if (input_length >= ((size_t)1 << 31))
+    return -1;   /* input of 2 GB or more is not supported */
+  /* clevel for lz4hc goes up to 16, at least in LZ4 1.1.3 */
+  return LZ4_compressHC2_limitedOutput(input, output, (int)input_length,
+                                       (int)maxout, clevel*2-1);
+}
+
+/* Decompress one LZ4 chunk that must expand to exactly `maxout` bytes.
+ * LZ4_decompress_safe() bounds its reads by `compressed_length`, so corrupt
+ * input cannot make it run past `input`.  Returns maxout, or 0 on error. */
+static int lz4_wrap_decompress(const char* input, size_t compressed_length,
+                               char* output, size_t maxout)
+{
+  int nbytes = LZ4_decompress_safe(input, output, (int)compressed_length,
+                                   (int)maxout);
+  return (nbytes == (int)maxout) ? (int)maxout : 0;
+}
+
+#endif /* HAVE_LZ4 */
+
+#if defined(HAVE_SNAPPY)
+/* Compress with snappy.  `maxout` is the capacity of `output`; the length
+ * left behind by snappy_compress() is returned, or 0 on failure. */
+static int snappy_wrap_compress(const char* input, size_t input_length,
+                                char* output, size_t maxout)
+{
+  size_t outlen = maxout;       /* in: capacity, out: compressed size */
+
+  if (snappy_compress(input, input_length, output, &outlen) == SNAPPY_OK)
+    return (int)outlen;
+  return 0;
+}
+
+/* Decompress with snappy.  `maxout` is the capacity of `output`; the length
+ * left behind by snappy_uncompress() is returned, or 0 on failure. */
+static int snappy_wrap_decompress(const char* input, size_t compressed_length,
+                                  char* output, size_t maxout)
+{
+  size_t outlen = maxout;       /* in: capacity, out: uncompressed size */
+
+  if (snappy_uncompress(input, compressed_length, output, &outlen) == SNAPPY_OK)
+    return (int)outlen;
+  return 0;
+}
+#endif /* HAVE_SNAPPY */
+
+#if defined(HAVE_ZLIB)
+/* zlib is not very respectful with sharing name space with others.
+ Fortunately, its names do not collide with those already in blosc. */
+/* Compress with zlib's compress2() at `clevel`; 0 signals failure. */
+static int zlib_wrap_compress(const char* input, size_t input_length,
+                              char* output, size_t maxout, int clevel)
+{
+  uLongf outlen = maxout;       /* capacity in, compressed size out */
+  int rc = compress2((Bytef*)output, &outlen,
+                     (Bytef*)input, (uLong)input_length, clevel);
+
+  if (rc == Z_OK)
+    return (int)outlen;
+  return 0;
+}
+
+/* Inflate with zlib's uncompress(); returns the output size, 0 on failure. */
+static int zlib_wrap_decompress(const char* input, size_t compressed_length,
+                                char* output, size_t maxout)
+{
+  uLongf outlen = maxout;       /* capacity in, decompressed size out */
+  int rc = uncompress((Bytef*)output, &outlen,
+                      (Bytef*)input, (uLong)compressed_length);
+
+  if (rc == Z_OK)
+    return (int)outlen;
+  return 0;
+}
+#endif /*  HAVE_ZLIB */
+
+#if defined(HAVE_ZSTD)
+/* Compress with Zstd.  Blosc levels below 9 map to 2*clevel-1 and the rest
+ * to 22 (hard-coded rather than ZSTD_maxCLevel(); see zstd#254).
+ * Returns the compressed size, or 0 when Zstd reports an error. */
+static int zstd_wrap_compress(const char* input, size_t input_length,
+                              char* output, size_t maxout, int clevel) {
+  const int zlevel = (clevel < 9) ? clevel * 2 - 1 : 22;
+  const size_t code = ZSTD_compress((void*)output, maxout,
+                                    (void*)input, input_length, zlevel);
+  if (ZSTD_isError(code))
+    return 0;
+  return (int)code;
+}
+
+/* Decompress with Zstd; errors are reported on stderr and yield 0. */
+static int zstd_wrap_decompress(const char* input, size_t compressed_length,
+                                char* output, size_t maxout) {
+  const size_t code = ZSTD_decompress((void*)output, maxout,
+                                      (void*)input, compressed_length);
+
+  if (!ZSTD_isError(code))
+    return (int)code;
+  fprintf(stderr, "error decompressing with Zstd: %s \n", ZSTD_getErrorName(code));
+  return 0;
+}
+#endif /*  HAVE_ZSTD */
+
+/* Pick the acceleration factor handed to the compressor:
+ *  - clevel 9: always 1
+ *  - blosclz: 32 when typesize is a power of two below 32
+ *  - lz4: 10 - clevel; anything else: 1 */
+static int get_accel(const struct blosc_context* context) {
+  const int32_t level = context->clevel;
+  const int32_t ts = context->typesize;
+  int accel = 1;
+
+  if (level != 9) {
+    if (context->compcode == BLOSC_BLOSCLZ) {
+      /* ts & (ts - 1) is zero only for powers of two (and for 0). See:
+       * http://www.exploringbinary.com/ten-ways-to-check-if-an-integer-is-a-power-of-two-in-c/
+       */
+      if (ts != 0 && (ts & (ts - 1)) == 0 && ts < 32) accel = 32;
+    }
+    else if (context->compcode == BLOSC_LZ4) {
+      /* This acceleration setting based on discussions held in:
+       * https://groups.google.com/forum/#!topic/lz4c/zosy90P8MQw
+       */
+      accel = 10 - level;
+    }
+  }
+  return accel;
+}
+
 /* Shuffle & compress a single block */
-static int blosc_c(int32_t blocksize, int32_t leftoverblock,
-                   int32_t ntbytes, int32_t maxbytes,
-                   uint8_t *src, uint8_t *dest, uint8_t *tmp)
+static int blosc_c(const struct blosc_context* context, int32_t blocksize,
+                   int32_t leftoverblock, int32_t ntbytes, int32_t maxbytes,
+                   const uint8_t *src, uint8_t *dest, uint8_t *tmp,
+                   uint8_t *tmp2)
 {
   int32_t j, neblock, nsplits;
   int32_t cbytes;                   /* number of compressed bytes in split */
   int32_t ctbytes = 0;              /* number of compressed bytes in block */
   int32_t maxout;
-  int32_t typesize = params.typesize;
-  uint8_t *_tmp;
-
-  if ((params.flags & BLOSC_DOSHUFFLE) && (typesize > 1)) {
-    /* Shuffle this block (this makes sense only if typesize > 1) */
+  int32_t typesize = context->typesize;
+  const uint8_t *_tmp = src;
+  char *compname;
+  int accel;
+  int bscount;
+
+  if (*(context->header_flags) & BLOSC_DOSHUFFLE & (typesize > 1)) {
+    /* Byte shuffling only makes sense if typesize > 1 */
     shuffle(typesize, blocksize, src, tmp);
     _tmp = tmp;
   }
-  else {
-    _tmp = src;
+  /* We don't allow more than 1 filter at the same time (yet) */
+  else if (*(context->header_flags) & BLOSC_DOBITSHUFFLE) {
+    bscount = bitshuffle(typesize, blocksize, src, tmp, tmp2);
+    if (bscount < 0)
+      return bscount;
+    _tmp = tmp;
   }
 
+  /* Calculate acceleration for different compressors */
+  accel = get_accel(context);
+
   /* Compress for each shuffled slice split for this block. */
   /* If typesize is too large, neblock is too small or we are in a
      leftover block, do not split at all. */
@@ -267,24 +606,70 @@ static int blosc_c(int32_t blocksize, int32_t leftoverblock,
     ntbytes += (int32_t)sizeof(int32_t);
     ctbytes += (int32_t)sizeof(int32_t);
     maxout = neblock;
+    #if defined(HAVE_SNAPPY)
+    if (context->compcode == BLOSC_SNAPPY) {
+      /* TODO perhaps refactor this to keep the value stashed somewhere */
+      maxout = snappy_max_compressed_length(neblock);
+    }
+    #endif /*  HAVE_SNAPPY */
     if (ntbytes+maxout > maxbytes) {
       maxout = maxbytes - ntbytes;   /* avoid buffer overrun */
       if (maxout <= 0) {
         return 0;                  /* non-compressible block */
       }
     }
-    cbytes = blosclz_compress(params.clevel, _tmp+j*neblock, neblock,
-                              dest, maxout);
-    if (cbytes >= maxout) {
-      /* Buffer overrun caused by blosclz_compress (should never happen) */
+    if (context->compcode == BLOSC_BLOSCLZ) {
+      cbytes = blosclz_compress(context->clevel, _tmp+j*neblock, neblock,
+                                dest, maxout, accel);
+    }
+    #if defined(HAVE_LZ4)
+    else if (context->compcode == BLOSC_LZ4) {
+      cbytes = lz4_wrap_compress((char *)_tmp+j*neblock, (size_t)neblock,
+                                 (char *)dest, (size_t)maxout, accel);
+    }
+    else if (context->compcode == BLOSC_LZ4HC) {
+      cbytes = lz4hc_wrap_compress((char *)_tmp+j*neblock, (size_t)neblock,
+                                   (char *)dest, (size_t)maxout,
+                                   context->clevel);
+    }
+    #endif /* HAVE_LZ4 */
+    #if defined(HAVE_SNAPPY)
+    else if (context->compcode == BLOSC_SNAPPY) {
+      cbytes = snappy_wrap_compress((char *)_tmp+j*neblock, (size_t)neblock,
+                                    (char *)dest, (size_t)maxout);
+    }
+    #endif /* HAVE_SNAPPY */
+    #if defined(HAVE_ZLIB)
+    else if (context->compcode == BLOSC_ZLIB) {
+      cbytes = zlib_wrap_compress((char *)_tmp+j*neblock, (size_t)neblock,
+                                  (char *)dest, (size_t)maxout,
+                                  context->clevel);
+    }
+    #endif /* HAVE_ZLIB */
+    #if defined(HAVE_ZSTD)
+    else if (context->compcode == BLOSC_ZSTD) {
+      cbytes = zstd_wrap_compress((char*)_tmp + j * neblock, (size_t)neblock,
+                                  (char*)dest, (size_t)maxout, context->clevel);
+    }
+    #endif /* HAVE_ZSTD */
+
+    else {
+      blosc_compcode_to_compname(context->compcode, &compname);
+      fprintf(stderr, "Blosc has not been compiled with '%s' ", compname);
+      fprintf(stderr, "compression support.  Please use one having it.");
+      return -5;    /* signals no compression support */
+    }
+
+    if (cbytes > maxout) {
+      /* Buffer overrun caused by compression (should never happen) */
       return -1;
     }
     else if (cbytes < 0) {
       /* cbytes should never be negative */
       return -2;
     }
-    else if (cbytes == 0) {
-      /* The compressor has been unable to compress data significantly. */
+    else if (cbytes == 0 || cbytes == neblock) {
+      /* The compressor has been unable to compress data at all. */
       /* Before doing the copy, check that we are not running into a
          buffer overflow. */
       if ((ntbytes+neblock) > maxbytes) {
@@ -293,7 +678,7 @@ static int blosc_c(int32_t blocksize, int32_t leftoverblock,
       memcpy(dest, _tmp+j*neblock, neblock);
       cbytes = neblock;
     }
-    ((int32_t *)(dest))[-1] = sw32(cbytes);
+    _sw32(dest - 4, cbytes);
     dest += cbytes;
     ntbytes += cbytes;
     ctbytes += cbytes;
@@ -302,25 +687,27 @@ static int blosc_c(int32_t blocksize, int32_t leftoverblock,
   return ctbytes;
 }
 
-
 /* Decompress & unshuffle a single block */
-static int blosc_d(int32_t blocksize, int32_t leftoverblock,
-                   uint8_t *src, uint8_t *dest, uint8_t *tmp, uint8_t *tmp2)
+static int blosc_d(struct blosc_context* context, int32_t blocksize, int32_t leftoverblock,
+                   const uint8_t *src, uint8_t *dest, uint8_t *tmp, uint8_t *tmp2)
 {
   int32_t j, neblock, nsplits;
   int32_t nbytes;                /* number of decompressed bytes in split */
   int32_t cbytes;                /* number of compressed bytes in split */
   int32_t ctbytes = 0;           /* number of compressed bytes in block */
   int32_t ntbytes = 0;           /* number of uncompressed bytes in block */
-  uint8_t *_tmp;
-  int32_t typesize = params.typesize;
-
-  if ((params.flags & BLOSC_DOSHUFFLE) && (typesize > 1)) {
+  uint8_t *_tmp = dest;
+  int32_t typesize = context->typesize;
+  int32_t compformat;
+  char *compname;
+  int bscount;
+
+  if ((*(context->header_flags) & BLOSC_DOSHUFFLE & (typesize > 1)) ||  \
+      (*(context->header_flags) & BLOSC_DOBITSHUFFLE)) {
     _tmp = tmp;
   }
-  else {
-    _tmp = dest;
-  }
+
+  compformat = (*(context->header_flags) & 0xe0) >> 5;
 
   /* Compress for each shuffled slice split for this block. */
   if ((typesize <= MAX_SPLITS) && (blocksize/typesize) >= MIN_BUFFERSIZE &&
@@ -332,7 +719,7 @@ static int blosc_d(int32_t blocksize, int32_t leftoverblock,
   }
   neblock = blocksize / nsplits;
   for (j = 0; j < nsplits; j++) {
-    cbytes = sw32(((int32_t *)(src))[0]);   /* amount of compressed bytes */
+    cbytes = sw32_(src);      /* amount of compressed bytes */
     src += sizeof(int32_t);
     ctbytes += (int32_t)sizeof(int32_t);
     /* Uncompress */
@@ -341,10 +728,47 @@ static int blosc_d(int32_t blocksize, int32_t leftoverblock,
       nbytes = neblock;
     }
     else {
-      nbytes = blosclz_decompress(src, cbytes, _tmp, neblock);
+      if (compformat == BLOSC_BLOSCLZ_FORMAT) {
+        nbytes = blosclz_decompress(src, cbytes, _tmp, neblock);
+      }
+      #if defined(HAVE_LZ4)
+      else if (compformat == BLOSC_LZ4_FORMAT) {
+        nbytes = lz4_wrap_decompress((char *)src, (size_t)cbytes,
+                                     (char*)_tmp, (size_t)neblock);
+      }
+      #endif /*  HAVE_LZ4 */
+      #if defined(HAVE_SNAPPY)
+      else if (compformat == BLOSC_SNAPPY_FORMAT) {
+        nbytes = snappy_wrap_decompress((char *)src, (size_t)cbytes,
+                                        (char*)_tmp, (size_t)neblock);
+      }
+      #endif /*  HAVE_SNAPPY */
+      #if defined(HAVE_ZLIB)
+      else if (compformat == BLOSC_ZLIB_FORMAT) {
+        nbytes = zlib_wrap_decompress((char *)src, (size_t)cbytes,
+                                      (char*)_tmp, (size_t)neblock);
+      }
+      #endif /*  HAVE_ZLIB */
+      #if defined(HAVE_ZSTD)
+      else if (compformat == BLOSC_ZSTD_FORMAT) {
+        nbytes = zstd_wrap_decompress((char*)src, (size_t)cbytes,
+                                      (char*)_tmp, (size_t)neblock);
+      }
+      #endif /*  HAVE_ZSTD */
+      else {
+        compname = clibcode_to_clibname(compformat);
+        fprintf(stderr,
+                "Blosc has not been compiled with decompression "
+                "support for '%s' format. ", compname);
+        fprintf(stderr, "Please recompile for adding this support.\n");
+        return -5;    /* signals no decompression support */
+      }
+
+      /* Check that decompressed bytes number is correct */
       if (nbytes != neblock) {
-        return -2;
+          return -2;
       }
+
     }
     src += cbytes;
     ctbytes += cbytes;
@@ -352,19 +776,13 @@ static int blosc_d(int32_t blocksize, int32_t leftoverblock,
     ntbytes += nbytes;
   } /* Closes j < nsplits */
 
-  if ((params.flags & BLOSC_DOSHUFFLE) && (typesize > 1)) {
-    if ((uintptr_t)dest % 16 == 0) {
-      /* 16-bytes aligned dest.  SSE2 unshuffle will work. */
-      unshuffle(typesize, blocksize, tmp, dest);
-    }
-    else {
-      /* dest is not aligned.  Use tmp2, which is aligned, and copy. */
-      unshuffle(typesize, blocksize, tmp, tmp2);
-      if (tmp2 != dest) {
-        /* Copy only when dest is not tmp2 (e.g. not blosc_getitem())  */
-        memcpy(dest, tmp2, blocksize);
-      }
-    }
+  if (*(context->header_flags) & BLOSC_DOSHUFFLE & (typesize > 1)) {
+    unshuffle(typesize, blocksize, tmp, dest);
+  }
+  else if (*(context->header_flags) & BLOSC_DOBITSHUFFLE) {
+    bscount = bitunshuffle(typesize, blocksize, tmp, dest, tmp2);
+    if (bscount < 0)
+      return bscount;
   }
 
   /* Return the number of uncompressed bytes */
@@ -373,43 +791,40 @@ static int blosc_d(int32_t blocksize, int32_t leftoverblock,
 
 
 /* Serial version for compression/decompression */
-static int serial_blosc(void)
+static int serial_blosc(struct blosc_context* context)
 {
   int32_t j, bsize, leftoverblock;
   int32_t cbytes;
-  int32_t compress = params.compress;
-  int32_t blocksize = params.blocksize;
-  int32_t ntbytes = params.ntbytes;
-  int32_t flags = params.flags;
-  int32_t maxbytes = params.maxbytes;
-  int32_t nblocks = params.nblocks;
-  int32_t leftover = params.nbytes % params.blocksize;
-  int32_t *bstarts = params.bstarts;
-  uint8_t *src = params.src;
-  uint8_t *dest = params.dest;
-  uint8_t *tmp = params.tmp[0];     /* tmp for thread 0 */
-  uint8_t *tmp2 = params.tmp2[0];   /* tmp2 for thread 0 */
 
-  for (j = 0; j < nblocks; j++) {
-    if (compress && !(flags & BLOSC_MEMCPYED)) {
-      bstarts[j] = sw32(ntbytes);
+  int32_t ebsize = context->blocksize + context->typesize * (int32_t)sizeof(int32_t);
+  int32_t ntbytes = context->num_output_bytes;
+
+  uint8_t *tmp = my_malloc(context->blocksize + ebsize);
+  uint8_t *tmp2 = tmp + context->blocksize;
+
+  for (j = 0; j < context->nblocks; j++) {
+    if (context->compress && !(*(context->header_flags) & BLOSC_MEMCPYED)) {
+      _sw32(context->bstarts + j * 4, ntbytes);
     }
-    bsize = blocksize;
+    bsize = context->blocksize;
     leftoverblock = 0;
-    if ((j == nblocks - 1) && (leftover > 0)) {
-      bsize = leftover;
+    if ((j == context->nblocks - 1) && (context->leftover > 0)) {
+      bsize = context->leftover;
       leftoverblock = 1;
     }
-    if (compress) {
-      if (flags & BLOSC_MEMCPYED) {
+    if (context->compress) {
+      if (*(context->header_flags) & BLOSC_MEMCPYED) {
         /* We want to memcpy only */
-        memcpy(dest+BLOSC_MAX_OVERHEAD+j*blocksize, src+j*blocksize, bsize);
+        memcpy(context->dest+BLOSC_MAX_OVERHEAD+j*context->blocksize,
+                context->src+j*context->blocksize,
+                bsize);
         cbytes = bsize;
       }
       else {
         /* Regular compression */
-        cbytes = blosc_c(bsize, leftoverblock, ntbytes, maxbytes,
-                         src+j*blocksize, dest+ntbytes, tmp);
+        cbytes = blosc_c(context, bsize, leftoverblock, ntbytes,
+                         context->destsize, context->src+j*context->blocksize,
+                         context->dest+ntbytes, tmp, tmp2);
         if (cbytes == 0) {
           ntbytes = 0;              /* uncompressible data */
           break;
@@ -417,15 +832,18 @@ static int serial_blosc(void)
       }
     }
     else {
-      if (flags & BLOSC_MEMCPYED) {
+      if (*(context->header_flags) & BLOSC_MEMCPYED) {
         /* We want to memcpy only */
-        memcpy(dest+j*blocksize, src+BLOSC_MAX_OVERHEAD+j*blocksize, bsize);
+        memcpy(context->dest+j*context->blocksize,
+                context->src+BLOSC_MAX_OVERHEAD+j*context->blocksize,
+                bsize);
         cbytes = bsize;
       }
       else {
         /* Regular decompression */
-        cbytes = blosc_d(bsize, leftoverblock,
-                         src+sw32(bstarts[j]), dest+j*blocksize, tmp, tmp2);
+        cbytes = blosc_d(context, bsize, leftoverblock,
+                          context->src + sw32_(context->bstarts + j * 4),
+                          context->dest+j*context->blocksize, tmp, tmp2);
       }
     }
     if (cbytes < 0) {
@@ -435,124 +853,64 @@ static int serial_blosc(void)
     ntbytes += cbytes;
   }
 
+  // Free temporaries
+  my_free(tmp);
+
   return ntbytes;
 }
 
 
 /* Threaded version for compression/decompression */
-static int parallel_blosc(void)
+static int parallel_blosc(struct blosc_context* context)
 {
+  int rc;
 
   /* Check whether we need to restart threads */
-  if (!init_threads_done || pid != getpid()) {
-    blosc_set_nthreads_(nthreads);
-  }
+  blosc_set_nthreads_(context);
+
+  /* Set sentinels */
+  context->thread_giveup_code = 1;
+  context->thread_nblock = -1;
 
   /* Synchronization point for all threads (wait for initialization) */
-  WAIT_INIT;
+  WAIT_INIT(-1, context);
+
   /* Synchronization point for all threads (wait for finalization) */
-  WAIT_FINISH;
+  WAIT_FINISH(-1, context);
 
-  if (giveup_code > 0) {
+  if (context->thread_giveup_code > 0) {
     /* Return the total bytes (de-)compressed in threads */
-    return params.ntbytes;
+    return context->num_output_bytes;
   }
   else {
     /* Compression/decompression gave up.  Return error code. */
-    return giveup_code;
-  }
-}
-
-
-/* Convenience functions for creating and releasing temporaries */
-static int create_temporaries(void)
-{
-  int32_t tid;
-  int32_t typesize = params.typesize;
-  int32_t blocksize = params.blocksize;
-  /* Extended blocksize for temporary destination.  Extended blocksize
-   is only useful for compression in parallel mode, but it doesn't
-   hurt serial mode either. */
-  int32_t ebsize = blocksize + typesize*(int32_t)sizeof(int32_t);
-
-  /* Create temporary area for each thread */
-  for (tid = 0; tid < nthreads; tid++) {
-    uint8_t *tmp = my_malloc(blocksize);
-    uint8_t *tmp2;
-    if (tmp == NULL) {
-      return -1;
-    }
-    params.tmp[tid] = tmp;
-    tmp2 = my_malloc(ebsize);
-    if (tmp2 == NULL) {
-      return -1;
-    }
-    params.tmp2[tid] = tmp2;
-  }
-
-  init_temps_done = 1;
-  /* Update params for current temporaries */
-  current_temp.nthreads = nthreads;
-  current_temp.typesize = typesize;
-  current_temp.blocksize = blocksize;
-  return 0;
-}
-
-
-static void release_temporaries(void)
-{
-  int32_t tid;
-
-  /* Release buffers */
-  for (tid = 0; tid < nthreads; tid++) {
-    my_free(params.tmp[tid]);
-    my_free(params.tmp2[tid]);
+    return context->thread_giveup_code;
   }
-
-  init_temps_done = 0;
 }
 
 
 /* Do the compression or decompression of the buffer depending on the
    global params. */
-static int do_job(void)
+static int do_job(struct blosc_context* context)
 {
   int32_t ntbytes;
 
-  /* Initialize/reset temporaries if needed */
-  if (!init_temps_done) {
-    int ret;
-    ret = create_temporaries();
-    if (ret < 0) {
-      return -1;
-    }
-  }
-  else if (current_temp.nthreads != nthreads ||
-           current_temp.typesize != params.typesize ||
-           current_temp.blocksize != params.blocksize) {
-    int ret;
-    release_temporaries();
-    ret = create_temporaries();
-    if (ret < 0) {
-      return -1;
-    }
-  }
-
   /* Run the serial version when nthreads is 1 or when the buffers are
      not much larger than blocksize */
-  if (nthreads == 1 || (params.nbytes / params.blocksize) <= 1) {
-    ntbytes = serial_blosc();
+  if (context->numthreads == 1 || (context->sourcesize / context->blocksize) <= 1) {
+    ntbytes = serial_blosc(context);
   }
   else {
-    ntbytes = parallel_blosc();
+    ntbytes = parallel_blosc(context);
   }
 
   return ntbytes;
 }
 
 
-static int32_t compute_blocksize(int32_t clevel, int32_t typesize,
-                                 int32_t nbytes)
+static int32_t compute_blocksize(struct blosc_context* context, int32_t clevel,
+                                 int32_t typesize, int32_t nbytes,
+                                 int32_t forced_blocksize)
 {
   int32_t blocksize;
 
@@ -563,32 +921,54 @@ static int32_t compute_blocksize(int32_t clevel, int32_t typesize,
 
   blocksize = nbytes;           /* Start by a whole buffer as blocksize */
 
-  if (force_blocksize) {
-    blocksize = force_blocksize;
-    /* Check that forced blocksize is not too small nor too large */
+  if (forced_blocksize) {
+    blocksize = forced_blocksize;
+    /* Check that forced blocksize is not too small */
     if (blocksize < MIN_BUFFERSIZE) {
       blocksize = MIN_BUFFERSIZE;
     }
   }
-  else if (nbytes >= L1*4) {
-    blocksize = L1 * 4;
+  else if (nbytes >= L1) {
+    blocksize = L1;
+
+    /* For LZ4HC, increase the block sizes by a factor of 8 because it
+       is meant for compressing large blocks (it shows a big overhead
+       when compressing small ones). */
+    if (context->compcode == BLOSC_LZ4HC) {
+      blocksize *= 8;
+    }
+
+    /* For Zlib, increase the block sizes by a factor of 8 because it
+       is meant for compressing large blocks (it shows a big overhead
+       when compressing small ones). */
+    if (context->compcode == BLOSC_ZLIB) {
+      blocksize *= 8;
+    }
+
+    /* For Zstd, increase the block sizes by a factor of 8 because it
+       is meant for compressing large blocks (it shows a big overhead
+       when compressing small ones). */
+    if (context->compcode == BLOSC_ZSTD) {
+      blocksize *= 8;
+    }
+
     if (clevel == 0) {
-      blocksize /= 16;
+      blocksize /= 4;
     }
     else if (clevel <= 3) {
-      blocksize /= 8;
+      blocksize /= 2;
     }
     else if (clevel <= 5) {
-      blocksize /= 4;
+      blocksize *= 1;
     }
     else if (clevel <= 6) {
-      blocksize /= 2;
+      blocksize *= 2;
     }
     else if (clevel < 9) {
-      blocksize *= 1;
+      blocksize *= 4;
     }
     else {
-      blocksize *= 2;
+      blocksize *= 16;
     }
   }
 
@@ -597,50 +977,47 @@ static int32_t compute_blocksize(int32_t clevel, int32_t typesize,
     blocksize = nbytes;
   }
 
-  /* blocksize must be a multiple of the typesize */
+  /* blocksize *must absolutely* be a multiple of the typesize */
   if (blocksize > typesize) {
     blocksize = blocksize / typesize * typesize;
   }
 
-  /* blocksize must not exceed (64 KB * typesize) in order to allow
-     BloscLZ to achieve better compression ratios (the ultimate reason
-     for this is that hash_log in BloscLZ cannot be larger than 15) */
-  if ((blocksize / typesize) > 64*KB) {
-    blocksize = 64 * KB * typesize;
-  }
-
   return blocksize;
 }
 
-
-/* The public routine for compression.  See blosc.h for docstrings. */
-int blosc_compress(int clevel, int doshuffle, size_t typesize, size_t nbytes,
-      const void *src, void *dest, size_t destsize)
+static int initialize_context_compression(struct blosc_context* context,
+                          int clevel,
+                          int doshuffle,
+                          size_t typesize,
+                          size_t sourcesize,
+                          const void* src,
+                          void* dest,
+                          size_t destsize,
+                          int32_t compressor,
+                          int32_t blocksize,
+                          int32_t numthreads)
 {
-  uint8_t *_dest=NULL;         /* current pos for destination buffer */
-  uint8_t *flags;              /* flags for header.  Currently booked:
-                                  - 0: shuffled?
-                                  - 1: memcpy'ed? */
-  int32_t nbytes_;            /* number of bytes in source buffer */
-  int32_t nblocks;            /* number of total blocks in buffer */
-  int32_t leftover;           /* extra bytes at end of buffer */
-  int32_t *bstarts;           /* start pointers for each block */
-  int32_t blocksize;          /* length of the block in bytes */
-  int32_t ntbytes = 0;        /* the number of compressed bytes */
-  int32_t *ntbytes_;          /* placeholder for bytes in output buffer */
-  int32_t maxbytes = (int32_t)destsize;  /* maximum size for dest buffer */
+  /* Set parameters */
+  context->compress = 1;
+  context->src = (const uint8_t*)src;
+  context->dest = (uint8_t *)(dest);
+  context->num_output_bytes = 0;
+  context->destsize = (int32_t)destsize;
+  context->sourcesize = sourcesize;
+  context->typesize = typesize;
+  context->compcode = compressor;
+  context->numthreads = numthreads;
+  context->end_threads = 0;
+  context->clevel = clevel;
 
   /* Check buffer size limits */
-  if (nbytes > BLOSC_MAX_BUFFERSIZE) {
+  if (sourcesize > BLOSC_MAX_BUFFERSIZE) {
     /* If buffer is too large, give up. */
     fprintf(stderr, "Input buffer size cannot exceed %d bytes\n",
             BLOSC_MAX_BUFFERSIZE);
     return -1;
   }
 
-  /* We can safely do this assignation now */
-  nbytes_ = (int32_t)nbytes;
-
   /* Compression level */
   if (clevel < 0 || clevel > 9) {
     /* If clevel not in 0..9, print an error */
@@ -649,206 +1026,410 @@ int blosc_compress(int clevel, int doshuffle, size_t typesize, size_t nbytes,
   }
 
   /* Shuffle */
-  if (doshuffle != 0 && doshuffle != 1) {
-    fprintf(stderr, "`shuffle` parameter must be either 0 or 1!\n");
+  if (doshuffle != 0 && doshuffle != 1 && doshuffle != 2) {
+    fprintf(stderr, "`shuffle` parameter must be either 0, 1 or 2!\n");
     return -10;
   }
 
   /* Check typesize limits */
-  if (typesize > BLOSC_MAX_TYPESIZE) {
+  if (context->typesize > BLOSC_MAX_TYPESIZE) {
     /* If typesize is too large, treat buffer as an 1-byte stream. */
-    typesize = 1;
+    context->typesize = 1;
   }
 
   /* Get the blocksize */
-  blocksize = compute_blocksize(clevel, (int32_t)typesize, nbytes_);
+  context->blocksize = compute_blocksize(context, clevel, (int32_t)context->typesize, context->sourcesize, blocksize);
 
   /* Compute number of blocks in buffer */
-  nblocks = nbytes_ / blocksize;
-  leftover = nbytes_ % blocksize;
-  nblocks = (leftover>0)? nblocks+1: nblocks;
+  context->nblocks = context->sourcesize / context->blocksize;
+  context->leftover = context->sourcesize % context->blocksize;
+  context->nblocks = (context->leftover > 0) ? (context->nblocks + 1) : context->nblocks;
+
+  return 1;
+}
+/* Write the chunk header into context->dest (format versions, flags, typesize, source size, block size) and set header_flags/bstarts/num_output_bytes.  Returns 1 on success, -5 when context->compcode has no case below. */
+static int write_compression_header(struct blosc_context* context, int clevel, int doshuffle)
+{
+  int32_t compformat;
+
+  /* Write version header for this block */
+  context->dest[0] = BLOSC_VERSION_FORMAT;              /* blosc format version */
+
+  /* Write compressor format */
+  compformat = -1;
+  switch (context->compcode)
+  {
+  case BLOSC_BLOSCLZ:
+    compformat = BLOSC_BLOSCLZ_FORMAT;
+    context->dest[1] = BLOSC_BLOSCLZ_VERSION_FORMAT; /* blosclz format version */
+    break;
+
+#if defined(HAVE_LZ4)
+  case BLOSC_LZ4:
+    compformat = BLOSC_LZ4_FORMAT;
+    context->dest[1] = BLOSC_LZ4_VERSION_FORMAT;  /* lz4 format version */
+    break;
+  case BLOSC_LZ4HC:
+    compformat = BLOSC_LZ4HC_FORMAT;
+    context->dest[1] = BLOSC_LZ4HC_VERSION_FORMAT; /* lz4hc is the same as lz4 */
+    break;
+#endif /* HAVE_LZ4 */
+
+#if defined(HAVE_SNAPPY)
+  case BLOSC_SNAPPY:
+    compformat = BLOSC_SNAPPY_FORMAT;
+    context->dest[1] = BLOSC_SNAPPY_VERSION_FORMAT;    /* snappy format version */
+    break;
+#endif /* HAVE_SNAPPY */
+
+#if defined(HAVE_ZLIB)
+  case BLOSC_ZLIB:
+    compformat = BLOSC_ZLIB_FORMAT;
+    context->dest[1] = BLOSC_ZLIB_VERSION_FORMAT;      /* zlib format version */
+    break;
+#endif /* HAVE_ZLIB */
+
+#if defined(HAVE_ZSTD)
+  case BLOSC_ZSTD:
+    compformat = BLOSC_ZSTD_FORMAT;
+    context->dest[1] = BLOSC_ZSTD_VERSION_FORMAT;      /* zstd format version */
+    break;
+#endif /* HAVE_ZSTD */
+
+  default:
+  {
+    char *compname = NULL;  /* pre-set in case the lookup leaves it untouched */
+    blosc_compcode_to_compname(context->compcode, &compname);  /* compformat is still -1 here, so look the name up by compcode */
+    fprintf(stderr, "Blosc has not been compiled with '%s' ", compname ? compname : "(unknown)");
+    fprintf(stderr, "compression support.  Please use one having it.");
+    return -5;    /* signals no compression support */
+    break;
+  }
+  }
 
-  _dest = (uint8_t *)(dest);
-  /* Write header for this block */
-  _dest[0] = BLOSC_VERSION_FORMAT;         /* blosc format version */
-  _dest[1] = BLOSCLZ_VERSION_FORMAT;       /* blosclz format version */
-  flags = _dest+2;                         /* flags */
-  _dest[2] = 0;                            /* zeroes flags */
-  _dest[3] = (uint8_t)typesize;            /* type size */
-  _dest += 4;
-  ((int32_t *)_dest)[0] = sw32(nbytes_);  /* size of the buffer */
-  ((int32_t *)_dest)[1] = sw32(blocksize);/* block size */
-  ntbytes_ = (int32_t *)(_dest+8);        /* compressed buffer size */
-  _dest += sizeof(int32_t)*3;
-  bstarts = (int32_t *)_dest;             /* starts for every block */
-  _dest += sizeof(int32_t)*nblocks;        /* space for pointers to blocks */
-  ntbytes = (int32_t)(_dest - (uint8_t *)dest);
-
-  if (clevel == 0) {
+  context->header_flags = context->dest+2;  /* flags */
+  context->dest[2] = 0;  /* zeroes flags */
+  context->dest[3] = (uint8_t)context->typesize;  /* type size */
+  _sw32(context->dest + 4, context->sourcesize);  /* size of the buffer */
+  _sw32(context->dest + 8, context->blocksize);  /* block size */
+  context->bstarts = context->dest + 16;  /* starts for every block */
+  context->num_output_bytes = 16 + sizeof(int32_t)*context->nblocks;  /* space for header and pointers */
+
+  if (context->clevel == 0) {
     /* Compression level 0 means buffer to be memcpy'ed */
-    *flags |= BLOSC_MEMCPYED;
+    *(context->header_flags) |= BLOSC_MEMCPYED;
   }
 
-  if (nbytes_ < MIN_BUFFERSIZE) {
+  if (context->sourcesize < MIN_BUFFERSIZE) {
     /* Buffer is too small.  Try memcpy'ing. */
-    *flags |= BLOSC_MEMCPYED;
+    *(context->header_flags) |= BLOSC_MEMCPYED;
   }
 
-  if (doshuffle == 1) {
-    /* Shuffle is active */
-    *flags |= BLOSC_DOSHUFFLE;              /* bit 0 set to one in flags */
+  if (doshuffle == BLOSC_SHUFFLE) {
+    /* Byte-shuffle is active */
+    *(context->header_flags) |= BLOSC_DOSHUFFLE;     /* bit 0 set to one in flags */
   }
 
-  /* Take global lock for the time of compression */
-  pthread_mutex_lock(&global_comp_mutex);
-  /* Populate parameters for compression routines */
-  params.compress = 1;
-  params.clevel = clevel;
-  params.flags = (int32_t)*flags;
-  params.typesize = (int32_t)typesize;
-  params.blocksize = blocksize;
-  params.ntbytes = ntbytes;
-  params.nbytes = nbytes_;
-  params.maxbytes = maxbytes;
-  params.nblocks = nblocks;
-  params.leftover = leftover;
-  params.bstarts = bstarts;
-  params.src = (uint8_t *)src;
-  params.dest = (uint8_t *)dest;
-
-  if (!(*flags & BLOSC_MEMCPYED)) {
+  if (doshuffle == BLOSC_BITSHUFFLE) {
+    /* Bit-shuffle is active */
+    *(context->header_flags) |= BLOSC_DOBITSHUFFLE;  /* bit 2 set to one in flags */
+  }
+
+  *(context->header_flags) |= compformat << 5;      /* compressor format start at bit 5 */
+
+  return 1;
+}
+
+int blosc_compress_context(struct blosc_context* context)
+{
+  int32_t ntbytes = 0;
+
+  if (!(*(context->header_flags) & BLOSC_MEMCPYED)) {
     /* Do the actual compression */
-    ntbytes = do_job();
+    ntbytes = do_job(context);
     if (ntbytes < 0) {
       return -1;
     }
-    if ((ntbytes == 0) && (nbytes_+BLOSC_MAX_OVERHEAD <= maxbytes)) {
+    if ((ntbytes == 0) && (context->sourcesize+BLOSC_MAX_OVERHEAD <= context->destsize)) {
       /* Last chance for fitting `src` buffer in `dest`.  Update flags
        and do a memcpy later on. */
-      *flags |= BLOSC_MEMCPYED;
-      params.flags |= BLOSC_MEMCPYED;
+      *(context->header_flags) |= BLOSC_MEMCPYED;
     }
   }
 
-  if (*flags & BLOSC_MEMCPYED) {
-    if (nbytes_+BLOSC_MAX_OVERHEAD > maxbytes) {
+  if (*(context->header_flags) & BLOSC_MEMCPYED) {
+    if (context->sourcesize + BLOSC_MAX_OVERHEAD > context->destsize) {
       /* We are exceeding maximum output size */
       ntbytes = 0;
     }
-    else if (((nbytes_ % L1) == 0) || (nthreads > 1)) {
-      /* More effective with large buffers that are multiples of the
-       cache size or multi-cores */
-      params.ntbytes = BLOSC_MAX_OVERHEAD;
-      ntbytes = do_job();
-      if (ntbytes < 0) {
-	return -1;
-      }
-    }
     else {
-      memcpy((uint8_t *)dest+BLOSC_MAX_OVERHEAD, src, nbytes_);
-      ntbytes = nbytes_ + BLOSC_MAX_OVERHEAD;
+      memcpy(context->dest+BLOSC_MAX_OVERHEAD, context->src,
+             context->sourcesize);
+      ntbytes = context->sourcesize + BLOSC_MAX_OVERHEAD;
     }
   }
 
   /* Set the number of compressed bytes in header */
-  *ntbytes_ = sw32(ntbytes);
+  _sw32(context->dest + 12, ntbytes);
 
-  /* Release global lock */
-  pthread_mutex_unlock(&global_comp_mutex);
-  
-  assert((int32_t)ntbytes <= (int32_t)maxbytes);
+  assert(ntbytes <= context->destsize);
   return ntbytes;
 }
 
+/* The public routine for compression with context: works on a stack-local blosc_context instead of g_global_context and takes the compressor by name.  Returns a negative error from the setup steps, otherwise the value of blosc_compress_context(). */
+int blosc_compress_ctx(int clevel, int doshuffle, size_t typesize,
+                       size_t nbytes, const void* src, void* dest,
+                       size_t destsize, const char* compressor,
+                       size_t blocksize, int numinternalthreads)
+{
+  int error, result;
+  struct blosc_context context;  /* private to this call */
 
-/* The public routine for decompression.  See blosc.h for docstrings. */
-int blosc_decompress(const void *src, void *dest, size_t destsize)
+  context.threads_started = 0;  /* pre-set here; the other fields are assigned by the calls below */
+  error = initialize_context_compression(&context, clevel, doshuffle, typesize,
+					 nbytes, src, dest, destsize,
+					 blosc_compname_to_compcode(compressor),
+					 blocksize, numinternalthreads);
+  if (error < 0) { return error; }  /* early exits happen before blosc_compress_context() runs */
+
+  error = write_compression_header(&context, clevel, doshuffle);
+  if (error < 0) { return error; }
+
+  result = blosc_compress_context(&context);
+
+  if (numinternalthreads > 1)  /* the pool is released only when more than one thread was requested */
+  {
+    blosc_release_threadpool(&context);
+  }
+
+  return result;
+}
+
+/* The public routine for compression.  See blosc.h for docstrings.  BLOSC_* environment variables override the arguments/global settings; when BLOSC_NOLOCK is set a private context is used instead of the mutex-protected global one. */
+int blosc_compress(int clevel, int doshuffle, size_t typesize, size_t nbytes,
+                   const void *src, void *dest, size_t destsize)
 {
-  uint8_t *_src=NULL;            /* current pos for source buffer */
-  uint8_t version, versionlz;    /* versions for compressed header */
-  uint8_t flags;                 /* flags for header */
-  int32_t ntbytes;               /* the number of uncompressed bytes */
-  int32_t nblocks;              /* number of total blocks in buffer */
-  int32_t leftover;             /* extra bytes at end of buffer */
-  int32_t *bstarts;             /* start pointers for each block */
-  int32_t typesize, blocksize, nbytes, ctbytes;
+  int error;
+  int result;
+  char* envvar;
+
+  /* Check if should initialize */
+  if (!g_initlib) blosc_init();
+
+  /* Check for a BLOSC_CLEVEL environment variable */
+  envvar = getenv("BLOSC_CLEVEL");
+  if (envvar != NULL) {
+    long value;
+    value = strtol(envvar, NULL, 10);
+    if (value >= 0) {  /* strtol() yields the parsed number, never an errno code such as EINVAL */
+      clevel = (int)value;
+    }
+  }
 
-  _src = (uint8_t *)(src);
+  /* Check for a BLOSC_SHUFFLE environment variable */
+  envvar = getenv("BLOSC_SHUFFLE");
+  if (envvar != NULL) {
+    if (strcmp(envvar, "NOSHUFFLE") == 0) {
+      doshuffle = BLOSC_NOSHUFFLE;
+    }
+    if (strcmp(envvar, "SHUFFLE") == 0) {
+      doshuffle = BLOSC_SHUFFLE;
+    }
+    if (strcmp(envvar, "BITSHUFFLE") == 0) {
+      doshuffle = BLOSC_BITSHUFFLE;
+    }
+  }
+
+  /* Check for a BLOSC_TYPESIZE environment variable */
+  envvar = getenv("BLOSC_TYPESIZE");
+  if (envvar != NULL) {
+    long value;
+    value = strtol(envvar, NULL, 10);
+    if (value > 0) {  /* a typesize equal to the numeric value of EINVAL is legitimate */
+      typesize = (int)value;
+    }
+  }
+
+  /* Check for a BLOSC_COMPRESSOR environment variable */
+  envvar = getenv("BLOSC_COMPRESSOR");
+  if (envvar != NULL) {
+    result = blosc_set_compressor(envvar);
+    if (result < 0) { return result; }
+  }
+
+  /* Check for a BLOSC_BLOCKSIZE environment variable */
+  envvar = getenv("BLOSC_BLOCKSIZE");
+  if (envvar != NULL) {
+    long blocksize;
+    blocksize = strtol(envvar, NULL, 10);
+    if (blocksize > 0) {  /* zero (also strtol's "no digits" result) keeps the current setting */
+      blosc_set_blocksize((size_t)blocksize);
+    }
+  }
+
+  /* Check for a BLOSC_NTHREADS environment variable */
+  envvar = getenv("BLOSC_NTHREADS");
+  if (envvar != NULL) {
+    long nthreads;
+    nthreads = strtol(envvar, NULL, 10);
+    if (nthreads > 0) {  /* comparing with EINVAL used to ignore a thread count of that value */
+      result = blosc_set_nthreads((int)nthreads);
+      if (result < 0) { return result; }
+    }
+  }
+
+  /* Check for a BLOSC_NOLOCK environment variable.  It is important
+     that this should be the last env var so that it can take the
+     previous ones into account */
+  envvar = getenv("BLOSC_NOLOCK");
+  if (envvar != NULL) {
+    char *compname;
+    blosc_compcode_to_compname(g_compressor, &compname);
+    result = blosc_compress_ctx(clevel, doshuffle, typesize,
+				nbytes, src, dest, destsize,
+				compname, g_force_blocksize, g_threads);
+    return result;
+  }
+
+  pthread_mutex_lock(&global_comp_mutex);
+
+  error = initialize_context_compression(g_global_context, clevel, doshuffle,
+					 typesize, nbytes, src, dest, destsize,
+					 g_compressor, g_force_blocksize,
+					 g_threads);
+  if (error < 0) { pthread_mutex_unlock(&global_comp_mutex); return error; }  /* never leave with the global lock held */
+
+  error = write_compression_header(g_global_context, clevel, doshuffle);
+  if (error < 0) { pthread_mutex_unlock(&global_comp_mutex); return error; }  /* same: unlock on the error path */
+
+  result = blosc_compress_context(g_global_context);
+
+  pthread_mutex_unlock(&global_comp_mutex);
+
+  return result;
+}
+/* Decompress the Blosc chunk at src into dest (capacity destsize) using `context`.  Returns the number of decompressed bytes, or -1 on an invalid block size, insufficient room in dest, or a failed do_job(). */
+int blosc_run_decompression_with_context(struct blosc_context* context,
+                                         const void* src,
+                                         void* dest,
+                                         size_t destsize,
+                                         int numinternalthreads)
+{
+  uint8_t version;
+  uint8_t versionlz;
+  uint32_t ctbytes;
+  int32_t ntbytes;
+
+  context->compress = 0;
+  context->src = (const uint8_t*)src;
+  context->dest = (uint8_t*)dest;
+  context->destsize = destsize;
+  context->num_output_bytes = 0;
+  context->numthreads = numinternalthreads;
+  context->end_threads = 0;
 
   /* Read the header block */
-  version = _src[0];                         /* blosc format version */
-  versionlz = _src[1];                       /* blosclz format version */
-  flags = _src[2];                           /* flags */
-  typesize = (int32_t)_src[3];              /* typesize */
-  _src += 4;
-  nbytes = sw32(((int32_t *)_src)[0]);      /* buffer size */
-  blocksize = sw32(((int32_t *)_src)[1]);   /* block size */
-  ctbytes = sw32(((int32_t *)_src)[2]);     /* compressed buffer size */
+  version = context->src[0];                        /* blosc format version */
+  versionlz = context->src[1];                      /* blosclz format version */
 
+  context->header_flags = (uint8_t*)(context->src + 2);           /* flags */
+  context->typesize = (int32_t)context->src[3];      /* typesize */
+  context->sourcesize = sw32_(context->src + 4);     /* buffer size */
+  context->blocksize = sw32_(context->src + 8);      /* block size */
+  ctbytes = sw32_(context->src + 12);               /* compressed buffer size */
+  if (context->blocksize <= 0) { return -1; }       /* corrupt header: blocksize is a divisor below */
+  /* Unused values */
   version += 0;                             /* shut up compiler warning */
   versionlz += 0;                           /* shut up compiler warning */
   ctbytes += 0;                             /* shut up compiler warning */
 
-  _src += sizeof(int32_t)*3;
-  bstarts = (int32_t *)_src;
+  context->bstarts = (uint8_t*)(context->src + 16);
   /* Compute some params */
   /* Total blocks */
-  nblocks = nbytes / blocksize;
-  leftover = nbytes % blocksize;
-  nblocks = (leftover>0)? nblocks+1: nblocks;
-  _src += sizeof(int32_t)*nblocks;
+  context->nblocks = context->sourcesize / context->blocksize;
+  context->leftover = context->sourcesize % context->blocksize;
+  context->nblocks = (context->leftover>0)? context->nblocks+1: context->nblocks;
 
   /* Check that we have enough space to decompress */
-  if (nbytes > (int32_t)destsize) {
+  if (context->sourcesize > (int32_t)destsize) {
     return -1;
   }
 
-  /* Take global lock for the time of decompression */
-  pthread_mutex_lock(&global_comp_mutex);
-  
-  /* Populate parameters for decompression routines */
-  params.compress = 0;
-  params.clevel = 0;            /* specific for compression */
-  params.flags = (int32_t)flags;
-  params.typesize = typesize;
-  params.blocksize = blocksize;
-  params.ntbytes = 0;
-  params.nbytes = nbytes;
-  params.nblocks = nblocks;
-  params.leftover = leftover;
-  params.bstarts = bstarts;
-  params.src = (uint8_t *)src;
-  params.dest = (uint8_t *)dest;
-
   /* Check whether this buffer is memcpy'ed */
-  if (flags & BLOSC_MEMCPYED) {
-    if (((nbytes % L1) == 0) || (nthreads > 1)) {
-      /* More effective with large buffers that are multiples of the
-       cache size or multi-cores */
-      ntbytes = do_job();
-      if (ntbytes < 0) {
-	return -1;
-      }
-    }
-    else {
-      memcpy(dest, (uint8_t *)src+BLOSC_MAX_OVERHEAD, nbytes);
-      ntbytes = nbytes;
-    }
+  if (*(context->header_flags) & BLOSC_MEMCPYED) {
+      memcpy(dest, (uint8_t *)src+BLOSC_MAX_OVERHEAD, context->sourcesize);
+      ntbytes = context->sourcesize;
   }
   else {
     /* Do the actual decompression */
-    ntbytes = do_job();
+    ntbytes = do_job(context);
     if (ntbytes < 0) {
       return -1;
     }
   }
-  /* Release global lock */
-  pthread_mutex_unlock(&global_comp_mutex);
-  
+
   assert(ntbytes <= (int32_t)destsize);
   return ntbytes;
 }
 
+/* The public routine for decompression with context: works on a stack-local blosc_context instead of g_global_context and returns the value of blosc_run_decompression_with_context(). */
+int blosc_decompress_ctx(const void *src, void *dest, size_t destsize,
+                         int numinternalthreads)
+{
+  int result;
+  struct blosc_context context;  /* private to this call */
+
+  context.threads_started = 0;  /* pre-set here; other fields are assigned in blosc_run_decompression_with_context() */
+  result = blosc_run_decompression_with_context(&context, src, dest, destsize, numinternalthreads);
+
+  if (numinternalthreads > 1)  /* the pool is released only when more than one thread was requested */
+  {
+    blosc_release_threadpool(&context);
+  }
+
+  return result;
+}
+
+
+/* The public routine for decompression.  See blosc.h for docstrings.  BLOSC_NTHREADS overrides the thread count; when BLOSC_NOLOCK is set a private context is used instead of the mutex-protected global one. */
+int blosc_decompress(const void *src, void *dest, size_t destsize)
+{
+  int result;
+  char* envvar;
+  long nthreads;
+
+  /* Check if should initialize */
+  if (!g_initlib) blosc_init();
+
+  /* Check for a BLOSC_NTHREADS environment variable */
+  envvar = getenv("BLOSC_NTHREADS");
+  if (envvar != NULL) {
+    nthreads = strtol(envvar, NULL, 10);
+    if (nthreads > 0) {  /* strtol() yields the parsed number, never EINVAL; the old test ignored that thread count */
+      result = blosc_set_nthreads((int)nthreads);
+      if (result < 0) { return result; }
+    }
+  }
+
+  /* Check for a BLOSC_NOLOCK environment variable.  It is important
+     that this should be the last env var so that it can take the
+     previous ones into account */
+  envvar = getenv("BLOSC_NOLOCK");
+  if (envvar != NULL) {
+    result = blosc_decompress_ctx(src, dest, destsize, g_threads);
+    return result;
+  }
+
+  pthread_mutex_lock(&global_comp_mutex);
+
+  result = blosc_run_decompression_with_context(g_global_context, src, dest,
+						destsize, g_threads);
+
+  pthread_mutex_unlock(&global_comp_mutex);  /* the callee's early returns come back here, so the lock is always released */
+
+  return result;
+}
+
 
 /* Specific routine optimized for decompression a small number of
    items out of a compressed chunk.  This does not use threads because
@@ -859,38 +1440,41 @@ int blosc_getitem(const void *src, int start, int nitems, void *dest)
   uint8_t version, versionlz;       /* versions for compressed header */
   uint8_t flags;                    /* flags for header */
   int32_t ntbytes = 0;              /* the number of uncompressed bytes */
-  int32_t nblocks;                 /* number of total blocks in buffer */
-  int32_t leftover;                /* extra bytes at end of buffer */
-  int32_t *bstarts;                /* start pointers for each block */
-  uint8_t *tmp = params.tmp[0];     /* tmp for thread 0 */
-  uint8_t *tmp2 = params.tmp2[0];   /* tmp2 for thread 0 */
+  int32_t nblocks;                  /* number of total blocks in buffer */
+  int32_t leftover;                 /* extra bytes at end of buffer */
+  uint8_t *bstarts;                 /* start pointers for each block */
   int tmp_init = 0;
   int32_t typesize, blocksize, nbytes, ctbytes;
   int32_t j, bsize, bsize2, leftoverblock;
   int32_t cbytes, startb, stopb;
   int stop = start + nitems;
+  uint8_t *tmp;
+  uint8_t *tmp2;
+  uint8_t *tmp3;
+  int32_t ebsize;
 
   _src = (uint8_t *)(src);
 
-  /* Take global lock  */
-  pthread_mutex_lock(&global_comp_mutex);
-  
   /* Read the header block */
-  version = _src[0];                         /* blosc format version */
-  versionlz = _src[1];                       /* blosclz format version */
-  flags = _src[2];                           /* flags */
+  version = _src[0];                        /* blosc format version */
+  versionlz = _src[1];                      /* blosclz format version */
+  flags = _src[2];                          /* flags */
   typesize = (int32_t)_src[3];              /* typesize */
-  _src += 4;
-  nbytes = sw32(((int32_t *)_src)[0]);      /* buffer size */
-  blocksize = sw32(((int32_t *)_src)[1]);   /* block size */
-  ctbytes = sw32(((int32_t *)_src)[2]);     /* compressed buffer size */
+  nbytes = sw32_(_src + 4);                 /* buffer size */
+  blocksize = sw32_(_src + 8);              /* block size */
+  ctbytes = sw32_(_src + 12);               /* compressed buffer size */
+
+  ebsize = blocksize + typesize * (int32_t)sizeof(int32_t);
+  tmp = my_malloc(blocksize + ebsize + blocksize);
+  tmp2 = tmp + blocksize;
+  tmp3 = tmp + blocksize + ebsize;
 
   version += 0;                             /* shut up compiler warning */
   versionlz += 0;                           /* shut up compiler warning */
   ctbytes += 0;                             /* shut up compiler warning */
 
-  _src += sizeof(int32_t)*3;
-  bstarts = (int32_t *)_src;
+  _src += 16;
+  bstarts = _src;
   /* Compute some params */
   /* Total blocks */
   nblocks = nbytes / blocksize;
@@ -901,29 +1485,12 @@ int blosc_getitem(const void *src, int start, int nitems, void *dest)
   /* Check region boundaries */
   if ((start < 0) || (start*typesize > nbytes)) {
     fprintf(stderr, "`start` out of bounds");
-    return (-1);
+    my_free(tmp); return -1;   /* release the scratch buffer allocated above */
   }
 
   if ((stop < 0) || (stop*typesize > nbytes)) {
     fprintf(stderr, "`start`+`nitems` out of bounds");
-    return (-1);
-  }
-
-  /* Parameters needed by blosc_d */
-  params.typesize = typesize;
-  params.flags = flags;
-
-  /* Initialize temporaries if needed */
-  if (tmp == NULL || tmp2 == NULL || current_temp.blocksize < blocksize) {
-    tmp = my_malloc(blocksize);
-    if (tmp == NULL) {
-      return -1;
-    }
-    tmp2 = my_malloc(blocksize);
-    if (tmp2 == NULL) {
-      return -1;
-    }
-    tmp_init = 1;
+    my_free(tmp); return -1;   /* release the scratch buffer allocated above */
   }
 
   for (j = 0; j < nblocks; j++) {
@@ -957,9 +1524,15 @@ int blosc_getitem(const void *src, int start, int nitems, void *dest)
       cbytes = bsize2;
     }
     else {
+      struct blosc_context context;
+      /* blosc_d only uses typesize and flags */
+      context.typesize = typesize;
+      context.header_flags = &flags;
+
       /* Regular decompression.  Put results in tmp2. */
-      cbytes = blosc_d(bsize, leftoverblock,
-                       (uint8_t *)src+sw32(bstarts[j]), tmp2, tmp, tmp2);
+      cbytes = blosc_d(&context, bsize, leftoverblock,
+                       (uint8_t *)src + sw32_(bstarts + j * 4),
+                       tmp2, tmp, tmp3);
       if (cbytes < 0) {
         ntbytes = cbytes;
         break;
@@ -970,23 +1543,17 @@ int blosc_getitem(const void *src, int start, int nitems, void *dest)
     }
     ntbytes += cbytes;
   }
-  
-  /* Release global lock */
-  pthread_mutex_unlock(&global_comp_mutex);
 
-  if (tmp_init) {
-    my_free(tmp);
-    my_free(tmp2);
-  }
+  my_free(tmp);
 
   return ntbytes;
 }
 
 
 /* Decompress & unshuffle several blocks in a single thread */
-static int t_blosc(void *tids)
+static void *t_blosc(void *ctxt)
 {
-  int32_t tid = *(int32_t *)tids;
+  struct thread_context* context = (struct thread_context*)ctxt;
   int32_t cbytes, ntdest;
   int32_t tblocks;              /* number of blocks per thread */
   int32_t leftover2;
@@ -1002,55 +1569,56 @@ static int t_blosc(void *tids)
   int32_t flags;
   int32_t nblocks;
   int32_t leftover;
-  int32_t *bstarts;
-  uint8_t *src;
+  uint8_t *bstarts;
+  const uint8_t *src;
   uint8_t *dest;
   uint8_t *tmp;
   uint8_t *tmp2;
+  uint8_t *tmp3;
+  int rc;
 
-  while (1) {
-
-    init_sentinels_done = 0;     /* sentinels have to be initialised yet */
-
+  while(1)
+  {
     /* Synchronization point for all threads (wait for initialization) */
-    WAIT_INIT;
+    WAIT_INIT(NULL, context->parent_context);
 
-    /* Check if thread has been asked to return */
-    if (end_threads) {
-      return(0);
+    if(context->parent_context->end_threads)
+    {
+      break;
     }
 
-    pthread_mutex_lock(&count_mutex);
-    if (!init_sentinels_done) {
-      /* Set sentinels and other global variables */
-      giveup_code = 1;            /* no error code initially */
-      nblock = -1;                /* block counter */
-      init_sentinels_done = 1;    /* sentinels have been initialised */
+    /* Get parameters for this thread before entering the main loop */
+    blocksize = context->parent_context->blocksize;
+    ebsize = blocksize + context->parent_context->typesize * (int32_t)sizeof(int32_t);
+    compress = context->parent_context->compress;
+    flags = *(context->parent_context->header_flags);
+    maxbytes = context->parent_context->destsize;
+    nblocks = context->parent_context->nblocks;
+    leftover = context->parent_context->leftover;
+    bstarts = context->parent_context->bstarts;
+    src = context->parent_context->src;
+    dest = context->parent_context->dest;
+
+    if (blocksize > context->tmpblocksize)
+    {
+      my_free(context->tmp);
+      context->tmp = my_malloc(blocksize + ebsize + blocksize);
+      context->tmp2 = context->tmp + blocksize;
+      context->tmp3 = context->tmp + blocksize + ebsize;
     }
-    pthread_mutex_unlock(&count_mutex);
 
-    /* Get parameters for this thread before entering the main loop */
-    blocksize = params.blocksize;
-    ebsize = blocksize + params.typesize*(int32_t)sizeof(int32_t);
-    compress = params.compress;
-    flags = params.flags;
-    maxbytes = params.maxbytes;
-    nblocks = params.nblocks;
-    leftover = params.leftover;
-    bstarts = params.bstarts;
-    src = params.src;
-    dest = params.dest;
-    tmp = params.tmp[tid];
-    tmp2 = params.tmp2[tid];
+    tmp = context->tmp;
+    tmp2 = context->tmp2;
+    tmp3 = context->tmp3;
 
     ntbytes = 0;                /* only useful for decompression */
 
     if (compress && !(flags & BLOSC_MEMCPYED)) {
       /* Compression always has to follow the block order */
-      pthread_mutex_lock(&count_mutex);
-      nblock++;
-      nblock_ = nblock;
-      pthread_mutex_unlock(&count_mutex);
+      pthread_mutex_lock(&context->parent_context->count_mutex);
+      context->parent_context->thread_nblock++;
+      nblock_ = context->parent_context->thread_nblock;
+      pthread_mutex_unlock(&context->parent_context->count_mutex);
       tblock = nblocks;
     }
     else {
@@ -1058,11 +1626,11 @@ static int t_blosc(void *tids)
        sequential block order on each thread */
 
       /* Blocks per thread */
-      tblocks = nblocks / nthreads;
-      leftover2 = nblocks % nthreads;
+      tblocks = nblocks / context->parent_context->numthreads;
+      leftover2 = nblocks % context->parent_context->numthreads;
       tblocks = (leftover2>0)? tblocks+1: tblocks;
 
-      nblock_ = tid*tblocks;
+      nblock_ = context->tid*tblocks;
       tblock = nblock_ + tblocks;
       if (tblock > nblocks) {
         tblock = nblocks;
@@ -1071,7 +1639,7 @@ static int t_blosc(void *tids)
 
     /* Loop over blocks */
     leftoverblock = 0;
-    while ((nblock_ < tblock) && giveup_code > 0) {
+    while ((nblock_ < tblock) && context->parent_context->thread_giveup_code > 0) {
       bsize = blocksize;
       if (nblock_ == (nblocks - 1) && (leftover > 0)) {
         bsize = leftover;
@@ -1086,8 +1654,8 @@ static int t_blosc(void *tids)
         }
         else {
           /* Regular compression */
-          cbytes = blosc_c(bsize, leftoverblock, 0, ebsize,
-                           src+nblock_*blocksize, tmp2, tmp);
+          cbytes = blosc_c(context->parent_context, bsize, leftoverblock, 0, ebsize,
+                           src+nblock_*blocksize, tmp2, tmp, tmp3);
         }
       }
       else {
@@ -1098,40 +1666,41 @@ static int t_blosc(void *tids)
           cbytes = bsize;
         }
         else {
-          cbytes = blosc_d(bsize, leftoverblock,
-                           src+sw32(bstarts[nblock_]), dest+nblock_*blocksize,
+          cbytes = blosc_d(context->parent_context, bsize, leftoverblock,
+                           src + sw32_(bstarts + nblock_ * 4),
+                           dest+nblock_*blocksize,
                            tmp, tmp2);
         }
       }
 
       /* Check whether current thread has to giveup */
-      if (giveup_code <= 0) {
+      if (context->parent_context->thread_giveup_code <= 0) {
         break;
       }
 
       /* Check results for the compressed/decompressed block */
       if (cbytes < 0) {            /* compr/decompr failure */
         /* Set giveup_code error */
-        pthread_mutex_lock(&count_mutex);
-        giveup_code = cbytes;
-        pthread_mutex_unlock(&count_mutex);
+        pthread_mutex_lock(&context->parent_context->count_mutex);
+        context->parent_context->thread_giveup_code = cbytes;
+        pthread_mutex_unlock(&context->parent_context->count_mutex);
         break;
       }
 
       if (compress && !(flags & BLOSC_MEMCPYED)) {
         /* Start critical section */
-        pthread_mutex_lock(&count_mutex);
-        ntdest = params.ntbytes;
-        bstarts[nblock_] = sw32(ntdest);    /* update block start counter */
-        if ( (cbytes == 0) || (ntdest+cbytes > (int32_t)maxbytes) ) {
-          giveup_code = 0;                  /* uncompressible buffer */
-          pthread_mutex_unlock(&count_mutex);
+        pthread_mutex_lock(&context->parent_context->count_mutex);
+        ntdest = context->parent_context->num_output_bytes;
+        _sw32(bstarts + nblock_ * 4, ntdest); /* update block start counter */
+        if ( (cbytes == 0) || (ntdest+cbytes > maxbytes) ) {
+          context->parent_context->thread_giveup_code = 0;  /* uncompressible buffer */
+          pthread_mutex_unlock(&context->parent_context->count_mutex);
           break;
         }
-        nblock++;
-        nblock_ = nblock;
-        params.ntbytes += cbytes;           /* update return bytes counter */
-        pthread_mutex_unlock(&count_mutex);
+        context->parent_context->thread_nblock++;
+        nblock_ = context->parent_context->thread_nblock;
+        context->parent_context->num_output_bytes += cbytes;           /* update return bytes counter */
+        pthread_mutex_unlock(&context->parent_context->count_mutex);
         /* End of critical section */
 
         /* Copy the compressed buffer to destination */
@@ -1146,56 +1715,74 @@ static int t_blosc(void *tids)
     } /* closes while (nblock_) */
 
     /* Sum up all the bytes decompressed */
-    if ((!compress || (flags & BLOSC_MEMCPYED)) && giveup_code > 0) {
+    if ((!compress || (flags & BLOSC_MEMCPYED)) && context->parent_context->thread_giveup_code > 0) {
       /* Update global counter for all threads (decompression only) */
-      pthread_mutex_lock(&count_mutex);
-      params.ntbytes += ntbytes;
-      pthread_mutex_unlock(&count_mutex);
+      pthread_mutex_lock(&context->parent_context->count_mutex);
+      context->parent_context->num_output_bytes += ntbytes;
+      pthread_mutex_unlock(&context->parent_context->count_mutex);
     }
 
     /* Meeting point for all threads (wait for finalization) */
-    WAIT_FINISH;
+    WAIT_FINISH(NULL, context->parent_context);
+  }
 
-  }  /* closes while(1) */
+  /* Cleanup our working space and context */
+  my_free(context->tmp);
+  my_free(context);
 
-  /* This should never be reached, but anyway */
-  return(0);
+  return(NULL);
 }
 
 
-static int init_threads(void)
+static int init_threads(struct blosc_context* context)
 {
   int32_t tid;
   int rc2;
+  int32_t ebsize;
+  struct thread_context* thread_context;
 
   /* Initialize mutex and condition variable objects */
-  pthread_mutex_init(&count_mutex, NULL);
+  pthread_mutex_init(&context->count_mutex, NULL);
+
+  /* Set context thread sentinels */
+  context->thread_giveup_code = 1;
+  context->thread_nblock = -1;
 
   /* Barrier initialization */
 #ifdef _POSIX_BARRIERS_MINE
-  pthread_barrier_init(&barr_init, NULL, nthreads+1);
-  pthread_barrier_init(&barr_finish, NULL, nthreads+1);
+  pthread_barrier_init(&context->barr_init, NULL, context->numthreads+1);
+  pthread_barrier_init(&context->barr_finish, NULL, context->numthreads+1);
 #else
-  pthread_mutex_init(&count_threads_mutex, NULL);
-  pthread_cond_init(&count_threads_cv, NULL);
-  count_threads = 0;      /* Reset threads counter */
+  pthread_mutex_init(&context->count_threads_mutex, NULL);
+  pthread_cond_init(&context->count_threads_cv, NULL);
+  context->count_threads = 0;      /* Reset threads counter */
 #endif
 
 #if !defined(_WIN32)
   /* Initialize and set thread detached attribute */
-  pthread_attr_init(&ct_attr);
-  pthread_attr_setdetachstate(&ct_attr, PTHREAD_CREATE_JOINABLE);
+  pthread_attr_init(&context->ct_attr);
+  pthread_attr_setdetachstate(&context->ct_attr, PTHREAD_CREATE_JOINABLE);
 #endif
 
   /* Finally, create the threads in detached state */
-  for (tid = 0; tid < nthreads; tid++) {
-    tids[tid] = tid;
+  for (tid = 0; tid < context->numthreads; tid++) {
+    context->tids[tid] = tid;
+
+    /* Create a per-thread context; the worker thread owns it and frees it when it exits */
+    thread_context = (struct thread_context*)my_malloc(sizeof(struct thread_context));
+    thread_context->parent_context = context;
+    thread_context->tid = tid;
+
+    ebsize = context->blocksize + context->typesize * (int32_t)sizeof(int32_t);
+    thread_context->tmp = my_malloc(context->blocksize + ebsize + context->blocksize);
+    thread_context->tmp2 = thread_context->tmp + context->blocksize;
+    thread_context->tmp3 = thread_context->tmp + context->blocksize + ebsize;
+    thread_context->tmpblocksize = context->blocksize;
+
 #if !defined(_WIN32)
-    rc2 = pthread_create(&threads[tid], &ct_attr, (void*)t_blosc,
-			(void *)&tids[tid]);
+    rc2 = pthread_create(&context->threads[tid], &context->ct_attr, t_blosc, (void *)thread_context);
 #else
-    rc2 = pthread_create(&threads[tid], NULL, (void*)t_blosc,
-			(void *)&tids[tid]);
+    rc2 = pthread_create(&context->threads[tid], NULL, t_blosc, (void *)thread_context);
 #endif
     if (rc2) {
       fprintf(stderr, "ERROR; return code from pthread_create() is %d\n", rc2);
@@ -1204,147 +1791,161 @@ static int init_threads(void)
     }
   }
 
-  init_threads_done = 1;                 /* Initialization done! */
-  pid = (int)getpid();                   /* save the PID for this process */
 
   return(0);
 }
 
-void blosc_init(void) {
-  /* Init global lock  */
-  pthread_mutex_init(&global_comp_mutex, NULL);
-  init_lib = 1;
+int blosc_get_nthreads(void)
+{
+  int ret = g_threads;
+
+  return ret;
 }
 
-int blosc_set_nthreads(int nthreads_new) 
+int blosc_set_nthreads(int nthreads_new)
 {
-  int ret;
+  int ret = g_threads;
 
-  /* Check if should initialize (implementing previous 1.2.3 behaviour,
-     where calling blosc_set_nthreads was enough) */
-  if (!init_lib) blosc_init();
+  /* Check if should initialize */
+  if (!g_initlib) blosc_init();
+
+  if (nthreads_new != ret){
+    /* Re-initialize Blosc */
+    blosc_destroy();
+    blosc_init();
+    g_threads = nthreads_new;
+  }
 
-  /* Take global lock  */
-  pthread_mutex_lock(&global_comp_mutex);
-  
-  ret = blosc_set_nthreads_(nthreads_new);
-  /* Release global lock  */
-  pthread_mutex_unlock(&global_comp_mutex);
-  
   return ret;
 }
 
-int blosc_set_nthreads_(int nthreads_new)
+int blosc_set_nthreads_(struct blosc_context* context)
 {
-  int32_t nthreads_old = nthreads;
-  int32_t t;
-  int rc2;
-  void *status;
-
-  if (nthreads_new > BLOSC_MAX_THREADS) {
+  if (context->numthreads > BLOSC_MAX_THREADS) {
     fprintf(stderr,
             "Error.  nthreads cannot be larger than BLOSC_MAX_THREADS (%d)",
             BLOSC_MAX_THREADS);
     return -1;
   }
-  else if (nthreads_new <= 0) {
+  else if (context->numthreads <= 0) {
     fprintf(stderr, "Error.  nthreads must be a positive integer");
     return -1;
   }
 
-  /* Only join threads if they are not initialized or if our PID is
-     different from that in pid var (probably means that we are a
-     subprocess, and thus threads are non-existent). */
-  if (nthreads > 1 && init_threads_done && pid == getpid()) {
-      /* Tell all existing threads to finish */
-      end_threads = 1;
-      /* Synchronization point for all threads (wait for initialization) */
-      WAIT_INIT;
-      /* Join exiting threads */
-      for (t=0; t<nthreads; t++) {
-        rc2 = pthread_join(threads[t], &status);
-        if (rc2) {
-          fprintf(stderr, "ERROR; return code from pthread_join() is %d\n", rc2);
-          fprintf(stderr, "\tError detail: %s\n", strerror(rc2));
-          return(-1);
-        }
-      }
-      init_threads_done = 0;
-      end_threads = 0;
-    }
-
-  /* Launch a new pool of threads (if necessary) */
-  nthreads = nthreads_new;
-  if (nthreads > 1 && (!init_threads_done || pid != getpid())) {
-    init_threads();
+  /* Launch a new pool of threads */
+  if (context->numthreads > 1 && context->numthreads != context->threads_started) {
+    blosc_release_threadpool(context);
+    init_threads(context);
   }
 
-  return nthreads_old;
+  /* We have now started the threads */
+  context->threads_started = context->numthreads;
+
+  return context->numthreads;
 }
 
+char* blosc_get_compressor(void)
+{
+  char* compname;
+  blosc_compcode_to_compname(g_compressor, &compname);
 
-/* Free possible memory temporaries and thread resources */
-int blosc_free_resources(void)
+  return compname;
+}
+
+int blosc_set_compressor(const char *compname)
 {
-  int32_t t;
-  int rc2;
-  void *status;
- 
-   /* Take global lock  */
-  pthread_mutex_lock(&global_comp_mutex);
+  int code = blosc_compname_to_compcode(compname);
 
-  /* Release temporaries */
-  if (init_temps_done) {
-    release_temporaries();
-  }
+  g_compressor = code;
 
-  /* Finish the possible thread pool */
-  if (nthreads > 1 && init_threads_done) {
-    /* Tell all existing threads to finish */
-    end_threads = 1;
-    /* Synchronization point for all threads (wait for initialization) */
-    WAIT_INIT;
-    /* Join exiting threads */
-    for (t=0; t<nthreads; t++) {
-      rc2 = pthread_join(threads[t], &status);
-      if (rc2) {
-        fprintf(stderr, "ERROR; return code from pthread_join() is %d\n", rc2);
-        fprintf(stderr, "\tError detail: %s\n", strerror(rc2));
-        return(-1);
-      }
-    }
+  /* Check if should initialize */
+  if (!g_initlib) blosc_init();
 
-    /* Release mutex and condition variable objects */
-    pthread_mutex_destroy(&count_mutex);
+  return code;
+}
 
-    /* Barriers */
-#ifdef _POSIX_BARRIERS_MINE
-    pthread_barrier_destroy(&barr_init);
-    pthread_barrier_destroy(&barr_finish);
-#else
-    pthread_mutex_destroy(&count_threads_mutex);
-    pthread_cond_destroy(&count_threads_cv);
-#endif
+char* blosc_list_compressors(void)
+{
+  /* Comma-separated names of the compiled-in codecs; assembled on first call. */
+  static char names[256];
+  static int names_ready = 0;
+
+  if (!names_ready) {
+    strcpy(names, BLOSC_BLOSCLZ_COMPNAME);   /* BloscLZ is always listed */
+#if defined(HAVE_LZ4)
+    strcat(names, "," BLOSC_LZ4_COMPNAME "," BLOSC_LZ4HC_COMPNAME);
+#endif /* HAVE_LZ4 */
+#if defined(HAVE_SNAPPY)
+    strcat(names, "," BLOSC_SNAPPY_COMPNAME);
+#endif /* HAVE_SNAPPY */
+#if defined(HAVE_ZLIB)
+    strcat(names, "," BLOSC_ZLIB_COMPNAME);
+#endif /* HAVE_ZLIB */
+#if defined(HAVE_ZSTD)
+    strcat(names, "," BLOSC_ZSTD_COMPNAME);
+#endif /* HAVE_ZSTD */
+    names_ready = 1;
+  }
+  return names;
+}
 
-    /* Thread attributes */
-#if !defined(_WIN32)
-    pthread_attr_destroy(&ct_attr);
+char* blosc_get_version_string(void)
+{
+  /* Static storage: the returned pointer stays valid after the call returns. */
+  static char version[256];
+  return strcpy(version, BLOSC_VERSION_STRING);
+}
+
+int blosc_get_complib_info(char *compname, char **complib, char **version)
+{
+  int clibcode;
+  char *clibname;
+  char *clibversion = "unknown";
+
+#if (defined(HAVE_LZ4) && defined(LZ4_VERSION_MAJOR)) || (defined(HAVE_SNAPPY) && defined(SNAPPY_VERSION)) || defined(ZSTD_VERSION_MAJOR)
+  char sbuffer[256];
 #endif
 
-    init_threads_done = 0;
-    end_threads = 0;
-  }
-   /* Release global lock  */
-  pthread_mutex_unlock(&global_comp_mutex);
-  return(0);
+  clibcode = compname_to_clibcode(compname);
+  clibname = clibcode_to_clibname(clibcode);
 
-}
+  /* complib version */
+  if (clibcode == BLOSC_BLOSCLZ_LIB) {
+    clibversion = BLOSCLZ_VERSION_STRING;
+  }
+#if defined(HAVE_LZ4)
+  else if (clibcode == BLOSC_LZ4_LIB) {
+#if defined(LZ4_VERSION_MAJOR)
+    sprintf(sbuffer, "%d.%d.%d",
+            LZ4_VERSION_MAJOR, LZ4_VERSION_MINOR, LZ4_VERSION_RELEASE);
+    clibversion = sbuffer;
+#endif /* LZ4_VERSION_MAJOR */
+  }
+#endif /* HAVE_LZ4 */
+#if defined(HAVE_SNAPPY)
+  else if (clibcode == BLOSC_SNAPPY_LIB) {
+#if defined(SNAPPY_VERSION)
+    sprintf(sbuffer, "%d.%d.%d", SNAPPY_MAJOR, SNAPPY_MINOR, SNAPPY_PATCHLEVEL);
+    clibversion = sbuffer;
+#endif /* SNAPPY_VERSION */
+  }
+#endif /* HAVE_SNAPPY */
+#if defined(HAVE_ZLIB)
+  else if (clibcode == BLOSC_ZLIB_LIB) {
+    clibversion = ZLIB_VERSION;
+  }
+#endif /* HAVE_ZLIB */
+#if defined(HAVE_ZSTD)
+  else if (clibcode == BLOSC_ZSTD_LIB) {
+    sprintf(sbuffer, "%d.%d.%d",
+            ZSTD_VERSION_MAJOR, ZSTD_VERSION_MINOR, ZSTD_VERSION_RELEASE);
+    clibversion = sbuffer;
+  }
+#endif /* HAVE_ZSTD */
 
-void blosc_destroy(void) {
-  /* Free the resources */
-  blosc_free_resources();
-  /* Destroy global lock */
-  pthread_mutex_destroy(&global_comp_mutex);
+  *complib = strdup(clibname);
+  *version = strdup(clibversion);
+  return clibcode;
 }
 
 /* Return `nbytes`, `cbytes` and `blocksize` from a compressed buffer. */
@@ -1355,17 +1956,16 @@ void blosc_cbuffer_sizes(const void *cbuffer, size_t *nbytes,
   uint8_t version, versionlz;              /* versions for compressed header */
 
   /* Read the version info (could be useful in the future) */
-  version = _src[0];                         /* blosc format version */
-  versionlz = _src[1];                       /* blosclz format version */
+  version = _src[0];                       /* blosc format version */
+  versionlz = _src[1];                     /* blosclz format version */
 
-  version += 0;                             /* shut up compiler warning */
-  versionlz += 0;                           /* shut up compiler warning */
+  version += 0;                            /* shut up compiler warning */
+  versionlz += 0;                          /* shut up compiler warning */
 
   /* Read the interesting values */
-  _src += 4;
-  *nbytes = (size_t)sw32(((int32_t *)_src)[0]);  /* uncompressed buffer size */
-  *blocksize = (size_t)sw32(((int32_t *)_src)[1]);   /* block size */
-  *cbytes = (size_t)sw32(((int32_t *)_src)[2]);  /* compressed buffer size */
+  *nbytes = (size_t)sw32_(_src + 4);       /* uncompressed buffer size */
+  *blocksize = (size_t)sw32_(_src + 8);    /* block size */
+  *cbytes = (size_t)sw32_(_src + 12);      /* compressed buffer size */
 }
 
 
@@ -1396,20 +1996,112 @@ void blosc_cbuffer_versions(const void *cbuffer, int *version,
   uint8_t *_src = (uint8_t *)(cbuffer);  /* current pos for source buffer */
 
   /* Read the version info */
-  *version = (int)_src[0];             /* blosc format version */
-  *versionlz = (int)_src[1];           /* blosclz format version */
+  *version = (int)_src[0];         /* blosc format version */
+  *versionlz = (int)_src[1];       /* Lempel-Ziv compressor format version */
+}
+
+
+/* Return the name of the compression library/format recorded in the
+   header of the compressed buffer `cbuffer`. */
+char *blosc_cbuffer_complib(const void *cbuffer)
+{
+  const uint8_t *header = (const uint8_t *)cbuffer;
+  int libcode;
+
+  /* The library code sits in the three highest bits of header byte 2
+     (the flags byte); shift it down and map it to its name. */
+  libcode = (header[2] >> 5) & 0x7;
+  return clibcode_to_clibname(libcode);
 }
 
+/* Get the internal blocksize to be used during compression.  0 means
+   that an automatic blocksize is computed internally. */
+int blosc_get_blocksize(void)
+{
+  return (int)g_force_blocksize;
+}
 
 /* Force the use of a specific blocksize.  If 0, an automatic
    blocksize will be used (the default). */
 void blosc_set_blocksize(size_t size)
 {
-  /* Take global lock  */
-  pthread_mutex_lock(&global_comp_mutex);
-  
-  force_blocksize = (int32_t)size;
-  
-   /* Release global lock  */
-  pthread_mutex_unlock(&global_comp_mutex);
+  g_force_blocksize = (int32_t)size;
+}
+
+void blosc_init(void)
+{
+  if (!g_initlib) {
+    /* First call: create the global lock and the shared global context. */
+    pthread_mutex_init(&global_comp_mutex, NULL);
+    g_global_context = (struct blosc_context*)my_malloc(sizeof(struct blosc_context));
+    g_global_context->threads_started = 0;   /* no worker pool started yet */
+    g_initlib = 1;
+  }
+}
+
+void blosc_destroy(void)
+{
+  if (g_initlib) {
+    /* Flag the library as uninitialized first, then undo blosc_init(). */
+    g_initlib = 0;
+    blosc_release_threadpool(g_global_context);
+    my_free(g_global_context);
+    pthread_mutex_destroy(&global_comp_mutex);
+  }
+}
+
+int blosc_release_threadpool(struct blosc_context* context)
+{
+  int32_t t;
+  void* status;
+  int rc;   /* NOTE(review): not referenced directly here; presumably used by the WAIT_INIT macro - confirm */
+  int rc2;
+  /* Stops and joins this context's worker threads, but only if a pool was started. */
+  if (context->threads_started > 0)
+  {
+    /* Tell all existing threads to finish */
+    context->end_threads = 1;
+
+    /* Sync threads */
+    WAIT_INIT(-1, context);
+
+    /* Join exiting threads; a failed join is reported on stderr but not returned */
+    for (t=0; t<context->threads_started; t++) {
+      rc2 = pthread_join(context->threads[t], &status);
+      if (rc2) {
+        fprintf(stderr, "ERROR; return code from pthread_join() is %d\n", rc2);
+        fprintf(stderr, "\tError detail: %s\n", strerror(rc2));
+      }
+    }
+
+    /* Release mutex and condition variable objects */
+    pthread_mutex_destroy(&context->count_mutex);
+
+    /* Barriers */
+  #ifdef _POSIX_BARRIERS_MINE
+      pthread_barrier_destroy(&context->barr_init);
+      pthread_barrier_destroy(&context->barr_finish);
+  #else
+      pthread_mutex_destroy(&context->count_threads_mutex);
+      pthread_cond_destroy(&context->count_threads_cv);
+  #endif
+
+      /* Thread attributes */
+  #if !defined(_WIN32)
+      pthread_attr_destroy(&context->ct_attr);
+  #endif
+
+  }
+  /* Reset the count unconditionally so the context reads as having no pool. */
+  context->threads_started = 0;
+
+  return 0;
+}
+
+int blosc_free_resources(void)
+{
+  /* Without an initialized library there is nothing to release: report -1.
+     Otherwise release the global context's thread pool and pass its
+     status back to the caller. */
+  return g_initlib ? blosc_release_threadpool(g_global_context) : -1;
 }
diff --git a/thirdparty/blosc/blosc.h b/thirdparty/blosc/blosc.h
index 6a7129a72619b100c71a4ad801d28c218af3e318..73b7c9e88bd95b649f75d6f17afd67c5a027f594 100644
--- a/thirdparty/blosc/blosc.h
+++ b/thirdparty/blosc/blosc.h
@@ -1,31 +1,34 @@
 /*********************************************************************
-  Blosc - Blocked Suffling and Compression Library
+  Blosc - Blocked Shuffling and Compression Library
 
-  Author: Francesc Alted <faltet@gmail.com>
+  Author: Francesc Alted <francesc@blosc.org>
 
   See LICENSES/BLOSC.txt for details about copyright and rights to use.
 **********************************************************************/
+#ifndef BLOSC_H
+#define BLOSC_H
 
 #include <limits.h>
+#include <stdlib.h>
+#include "blosc-export.h"
 
-#ifndef BLOSC_H
-#define BLOSC_H
+#ifdef __cplusplus
+extern "C" {
+#endif
 
 /* Version numbers */
 #define BLOSC_VERSION_MAJOR    1    /* for major interface/format changes  */
-#define BLOSC_VERSION_MINOR    2    /* for minor interface/format changes  */
-#define BLOSC_VERSION_RELEASE  3    /* for tweaks, bug-fixes, or development */
+#define BLOSC_VERSION_MINOR    10   /* for minor interface/format changes  */
+#define BLOSC_VERSION_RELEASE  1    /* for tweaks, bug-fixes, or development */
 
-#define BLOSC_VERSION_STRING   "1.2.3"  /* string version.  Sync with above! */
+#define BLOSC_VERSION_STRING   "1.10.1.dev"  /* string version.  Sync with above! */
 #define BLOSC_VERSION_REVISION "$Rev$"   /* revision version */
-#define BLOSC_VERSION_DATE     "$Date:: 2013-05-17 #$"    /* date version */
+#define BLOSC_VERSION_DATE     "$Date:: 2016-07-20 #$"    /* date version */
 
-/* The *_VERS_FORMAT should be just 1-byte long */
-#define BLOSC_VERSION_FORMAT    2   /* Blosc format version, starting at 1 */
-#define BLOSCLZ_VERSION_FORMAT  1   /* Blosclz format version, starting at 1 */
+#define BLOSCLZ_VERSION_STRING "1.0.5"   /* the internal compressor version */
 
-/* The combined blosc and blosclz formats */
-#define BLOSC_VERSION_CFORMAT (BLOSC_VERSION_FORMAT << 8) & (BLOSCLZ_VERSION_FORMAT)
+/* The *_FORMAT symbols should be just 1-byte long */
+#define BLOSC_VERSION_FORMAT    2   /* Blosc format version, starting at 1 */
 
 /* Minimum header length */
 #define BLOSC_MIN_HEADER_LENGTH 16
@@ -35,63 +38,122 @@
    implementations */
 #define BLOSC_MAX_OVERHEAD BLOSC_MIN_HEADER_LENGTH
 
-/* Maximum buffer size to be compressed */
+/* Maximum source buffer size to be compressed */
 #define BLOSC_MAX_BUFFERSIZE (INT_MAX - BLOSC_MAX_OVERHEAD)
 
-/* Maximum typesize before considering buffer as a stream of bytes */
+/* Maximum typesize before considering source buffer as a stream of bytes */
 #define BLOSC_MAX_TYPESIZE 255         /* Cannot be larger than 255 */
 
 /* The maximum number of threads (for some static arrays) */
 #define BLOSC_MAX_THREADS 256
 
-/* Codes for internal flags (see blosc_cbuffer_metainfo) */
-#define BLOSC_DOSHUFFLE 0x1
-#define BLOSC_MEMCPYED  0x2
+/* Codes for shuffling (see blosc_compress) */
+#define BLOSC_NOSHUFFLE   0  /* no shuffle */
+#define BLOSC_SHUFFLE     1  /* byte-wise shuffle */
+#define BLOSC_BITSHUFFLE  2  /* bit-wise shuffle */
 
+/* Codes for internal flags (see blosc_cbuffer_metainfo) */
+#define BLOSC_DOSHUFFLE    0x1	/* byte-wise shuffle */
+#define BLOSC_MEMCPYED     0x2	/* plain copy */
+#define BLOSC_DOBITSHUFFLE 0x4  /* bit-wise shuffle */
+
+/* Codes for the different compressors shipped with Blosc */
+#define BLOSC_BLOSCLZ   0
+#define BLOSC_LZ4       1
+#define BLOSC_LZ4HC     2
+#define BLOSC_SNAPPY    3
+#define BLOSC_ZLIB      4
+#define BLOSC_ZSTD      5
+
+/* Names for the different compressors shipped with Blosc */
+#define BLOSC_BLOSCLZ_COMPNAME   "blosclz"
+#define BLOSC_LZ4_COMPNAME       "lz4"
+#define BLOSC_LZ4HC_COMPNAME     "lz4hc"
+#define BLOSC_SNAPPY_COMPNAME    "snappy"
+#define BLOSC_ZLIB_COMPNAME      "zlib"
+#define BLOSC_ZSTD_COMPNAME      "zstd"
+
+/* Codes for compression libraries shipped with Blosc (code must be < 8) */
+#define BLOSC_BLOSCLZ_LIB   0
+#define BLOSC_LZ4_LIB       1
+#define BLOSC_SNAPPY_LIB    2
+#define BLOSC_ZLIB_LIB      3
+#define BLOSC_ZSTD_LIB      4
+
+/* Names for the different compression libraries shipped with Blosc */
+#define BLOSC_BLOSCLZ_LIBNAME   "BloscLZ"
+#define BLOSC_LZ4_LIBNAME       "LZ4"
+#define BLOSC_SNAPPY_LIBNAME    "Snappy"
+#define BLOSC_ZLIB_LIBNAME      "Zlib"
+#define BLOSC_ZSTD_LIBNAME      "Zstd"
+
+/* The codes for compressor formats shipped with Blosc */
+#define BLOSC_BLOSCLZ_FORMAT  BLOSC_BLOSCLZ_LIB
+#define BLOSC_LZ4_FORMAT      BLOSC_LZ4_LIB
+#define BLOSC_LZ4HC_FORMAT    BLOSC_LZ4_LIB /* LZ4HC and LZ4 share the same format */
+#define BLOSC_SNAPPY_FORMAT   BLOSC_SNAPPY_LIB
+#define BLOSC_ZLIB_FORMAT     BLOSC_ZLIB_LIB
+#define BLOSC_ZSTD_FORMAT     BLOSC_ZSTD_LIB
+
+
+/* The version formats for compressors shipped with Blosc */
+/* All versions here start at 1 */
+#define BLOSC_BLOSCLZ_VERSION_FORMAT  1
+#define BLOSC_LZ4_VERSION_FORMAT      1
+#define BLOSC_LZ4HC_VERSION_FORMAT    1  /* LZ4HC and LZ4 share the same format */
+#define BLOSC_SNAPPY_VERSION_FORMAT   1
+#define BLOSC_ZLIB_VERSION_FORMAT     1
+#define BLOSC_ZSTD_VERSION_FORMAT     1
 
 
 /**
-  Initialize the Blosc library. You must call this previous to any other
-  Blosc call, and make sure that you call this in a non-threaded environment.
-  Other Blosc calls can be called in a threaded environment, if desired.
-
- */
+  Initialize the Blosc library environment.
 
-void blosc_init(void);
+  You must call this prior to any other Blosc call, unless you want
+  Blosc to be used simultaneously in a multi-threaded environment, in
+  which case you should *exclusively* use the
+  blosc_compress_ctx()/blosc_decompress_ctx() pair (see below).
+  */
+BLOSC_EXPORT void blosc_init(void);
 
 
 /**
+  Destroy the Blosc library environment.
 
-  Destroy the Blosc library environment. You must call this after to you are
-  done with all the Blosc calls, and make sure that you call this in a
-  non-threaded environment.
-
- */
-
-void blosc_destroy(void);
+  You must call this after you are done with all the Blosc calls,
+  unless you have not used blosc_init() before (see blosc_init()
+  above).
+  */
+BLOSC_EXPORT void blosc_destroy(void);
 
 
 /**
   Compress a block of data in the `src` buffer and returns the size of
-  compressed block.  The size of `src` buffer is specified by
+  the compressed block.  The size of `src` buffer is specified by
   `nbytes`.  There is not a minimum for `src` buffer size (`nbytes`).
 
   `clevel` is the desired compression level and must be a number
   between 0 (no compression) and 9 (maximum compression).
 
   `doshuffle` specifies whether the shuffle compression preconditioner
-  should be applied or not.  0 means not applying it and 1 means
-  applying it.
+  should be applied or not.  BLOSC_NOSHUFFLE means not applying it,
+  BLOSC_SHUFFLE means applying it at a byte level and BLOSC_BITSHUFFLE
+  at a bit level (slower but may achieve better entropy alignment).
 
   `typesize` is the number of bytes for the atomic type in binary
   `src` buffer.  This is mainly useful for the shuffle preconditioner.
-  Only a typesize > 1 will allow the shuffle to work.
+  For implementation reasons, only a 1 < typesize < 256 will allow the
+  shuffle filter to work.  When typesize is not in this range, shuffle
+  will be silently disabled.
 
   The `dest` buffer must have at least the size of `destsize`.  Blosc
   guarantees that if you set `destsize` to, at least,
   (`nbytes`+BLOSC_MAX_OVERHEAD), the compression will always succeed.
   The `src` buffer and the `dest` buffer can not overlap.
 
+  Compression is memory safe and guaranteed not to write the `dest`
+  buffer more than what is specified in `destsize`.
+
   If `src` buffer cannot be compressed into `destsize`, the return
   value is zero and you should discard the contents of the `dest`
   buffer.
@@ -100,61 +162,234 @@ void blosc_destroy(void);
   should never happen.  If you see this, please report it back
   together with the buffer data causing this and compression settings.
 
-  Compression is memory safe and guaranteed not to write the `dest`
-  buffer more than what is specified in `destsize`.  However, it is
-  not re-entrant and not thread-safe (despite the fact that it uses
-  threads internally).
- */
+  Environment variables
+  ---------------------
+
+  blosc_compress() honors different environment variables to control
+  internal parameters without the need of doing that programmatically.
+  Here are the ones supported:
+
+  BLOSC_CLEVEL=(INTEGER): This will overwrite the `clevel` parameter
+  before the compression process starts.
+
+  BLOSC_SHUFFLE=[NOSHUFFLE | SHUFFLE | BITSHUFFLE]: This will
+  overwrite the `doshuffle` parameter before the compression process
+  starts.
+
+  BLOSC_TYPESIZE=(INTEGER): This will overwrite the `typesize`
+  parameter before the compression process starts.
+
+  BLOSC_COMPRESSOR=[BLOSCLZ | LZ4 | LZ4HC | SNAPPY | ZLIB]: This will
+  call blosc_set_compressor(BLOSC_COMPRESSOR) before the compression
+  process starts.
+
+  BLOSC_NTHREADS=(INTEGER): This will call
+  blosc_set_nthreads(BLOSC_NTHREADS) before the compression process
+  starts.
+
+  BLOSC_BLOCKSIZE=(INTEGER): This will call
+  blosc_set_blocksize(BLOSC_BLOCKSIZE) before the compression process
+  starts.  *NOTE:* The blocksize is a critical parameter with
+  important restrictions in the allowed values, so use this with care.
 
-int blosc_compress(int clevel, int doshuffle, size_t typesize, size_t nbytes,
-                   const void *src, void *dest, size_t destsize);
+  BLOSC_NOLOCK=(ANY VALUE): This will call blosc_compress_ctx() under
+  the hood, with the `compressor`, `blocksize` and
+  `numinternalthreads` parameters set to the same as the last calls to
+  blosc_set_compressor(), blosc_set_blocksize() and
+  blosc_set_nthreads().  BLOSC_CLEVEL, BLOSC_SHUFFLE, BLOSC_TYPESIZE
+  environment vars will also be honored.
+  */
+BLOSC_EXPORT int blosc_compress(int clevel, int doshuffle, size_t typesize,
+				size_t nbytes, const void *src, void *dest,
+				size_t destsize);
 
 
+/**
+  Context interface to blosc compression. This does not require a call
+  to blosc_init() and can be called from multithreaded applications
+  without the global lock being used, so allowing Blosc to be executed
+  simultaneously in those scenarios.
+
+  It uses the same parameters as the blosc_compress() function plus:
+
+  `compressor`: the string representing the type of compressor to use.
+
+  `blocksize`: the requested size of the compressed blocks.  If 0, an
+   automatic blocksize will be used.
+
+  `numinternalthreads`: the number of threads to use internally.
+
+  A negative return value means that an internal error happened.  This
+  should never happen.  If you see this, please report it back
+  together with the buffer data causing this and compression settings.
+*/
+BLOSC_EXPORT int blosc_compress_ctx(int clevel, int doshuffle, size_t typesize,
+                                    size_t nbytes, const void* src, void* dest,
+                                    size_t destsize, const char* compressor,
+                                    size_t blocksize, int numinternalthreads);
+
 /**
   Decompress a block of compressed data in `src`, put the result in
-  `dest` and returns the size of the decompressed block. If error
-  occurs, e.g. the compressed data is corrupted or the output buffer
-  is not large enough, then 0 (zero) or a negative value will be
-  returned instead.
+  `dest` and returns the size of the decompressed block.
 
   The `src` buffer and the `dest` buffer can not overlap.
 
   Decompression is memory safe and guaranteed not to write the `dest`
-  buffer more than what is specified in `destsize`.  However, it is
-  not re-entrant and not thread-safe (despite the fact that it uses
-  threads internally).
+  buffer more than what is specified in `destsize`.
+
+  If an error occurs, e.g. the compressed data is corrupted or the
+  output buffer is not large enough, then 0 (zero) or a negative value
+  will be returned instead.
+
+  Environment variables
+  ---------------------
+
+  blosc_decompress() honors different environment variables to control
+  internal parameters without the need of doing that programmatically.
+  Here are the ones supported:
+
+  BLOSC_NTHREADS=(INTEGER): This will call
+  blosc_set_nthreads(BLOSC_NTHREADS) before the proper decompression
+  process starts.
+
+  BLOSC_NOLOCK=(ANY VALUE): This will call blosc_decompress_ctx()
+  under the hood, with the `numinternalthreads` parameter set to the
+  same value as the last call to blosc_set_nthreads().
 */
+BLOSC_EXPORT int blosc_decompress(const void *src, void *dest, size_t destsize);
+
+
+/**
+  Context interface to blosc decompression. This does not require a
+  call to blosc_init() and can be called from multithreaded
+  applications without the global lock being used, so allowing Blosc
+  to be executed simultaneously in those scenarios.
+
+  It uses the same parameters as the blosc_decompress() function plus:
+
+  `numinternalthreads`: number of threads to use internally.
 
-int blosc_decompress(const void *src, void *dest, size_t destsize);
+  Decompression is memory safe and guaranteed not to write the `dest`
+  buffer more than what is specified in `destsize`.
 
+  If an error occurs, e.g. the compressed data is corrupted or the
+  output buffer is not large enough, then 0 (zero) or a negative value
+  will be returned instead.
+*/
+BLOSC_EXPORT int blosc_decompress_ctx(const void *src, void *dest,
+                                      size_t destsize, int numinternalthreads);
 
 /**
   Get `nitems` (of typesize size) in `src` buffer starting in `start`.
   The items are returned in `dest` buffer, which has to have enough
-  space for storing all items.  Returns the number of bytes copied to
-  `dest` or a negative value if some error happens.
- */
+  space for storing all items.
 
-int blosc_getitem(const void *src, int start, int nitems, void *dest);
+  Returns the number of bytes copied to `dest` or a negative value if
+  some error happens.
+  */
+BLOSC_EXPORT int blosc_getitem(const void *src, int start, int nitems, void *dest);
+
+
+/**
+  Returns the current number of threads that are used for
+  compression/decompression.
+  */
+BLOSC_EXPORT int blosc_get_nthreads(void);
 
 
 /**
   Initialize a pool of threads for compression/decompression.  If
   `nthreads` is 1, then the serial version is chosen and a possible
-  previous existing pool is ended.  Returns the previous number of
-  threads.  If this is not called, `nthreads` is set to 1 internally.
-*/
+  previous existing pool is ended.  If this is not called, `nthreads`
+  is set to 1 internally.
+
+  Returns the previous number of threads.
+  */
+BLOSC_EXPORT int blosc_set_nthreads(int nthreads);
+
+
+/**
+  Returns the current compressor that is used for compression.
+  */
+BLOSC_EXPORT char* blosc_get_compressor(void);
+
+
+/**
+  Select the compressor to be used.  The supported ones are "blosclz",
+  "lz4", "lz4hc", "snappy", "zlib" and "zstd".  If this function is not
+  called, then "blosclz" will be used.
+
+  In case the compressor is not recognized, or there is not support
+  for it in this build, it returns a -1.  Else it returns the code for
+  the compressor (>=0).
+  */
+BLOSC_EXPORT int blosc_set_compressor(const char* compname);
+
+
+/**
+  Get the `compname` associated with the `compcode`.
+
+  If the compressor code is not recognized, or there is not support
+  for it in this build, -1 is returned.  Else, the compressor code is
+  returned.
+ */
+BLOSC_EXPORT int blosc_compcode_to_compname(int compcode, char **compname);
+
+
+/**
+  Return the compressor code associated with the compressor name.
+
+  If the compressor name is not recognized, or there is not support
+  for it in this build, -1 is returned instead.
+ */
+BLOSC_EXPORT int blosc_compname_to_compcode(const char *compname);
+
+
+/**
+  Get a list of compressors supported in the current build.  The
+  returned value is a string with a concatenation of "blosclz", "lz4",
+  "lz4hc", "snappy", "zlib" or "zstd" separated by commas, depending
+  on which ones are present in the build.
 
-int blosc_set_nthreads(int nthreads);
+  This function does not leak, so you should not free() the returned
+  list.
 
+  This function should always succeed.
+  */
+BLOSC_EXPORT char* blosc_list_compressors(void);
 
 /**
-  Free possible memory temporaries and thread resources.  Use this when you
-  are not going to use Blosc for a long while.  In case of problems releasing
-  the resources, it returns a negative number, else it returns 0.
+  Return the version of blosc in string format.
+
+  Useful for dynamic libraries.
 */
+BLOSC_EXPORT char* blosc_get_version_string(void);
+
+
+/**
+  Get info from compression libraries included in the current build.
+  In `compname` you pass the compressor name that you want info from.
+  In `complib` and `version` you get the compression library name and
+  version (if available) as output.
 
-int blosc_free_resources(void);
+  In `complib` and `version` you get a pointer to the compressor
+  library name and the version in string format respectively.  After
+  using the name and version, you should free() them so as to avoid
+  leaks.
+
+  If the compressor is supported, it returns the code for the library
+  (>=0).  If it is not supported, this function returns -1.
+  */
+BLOSC_EXPORT int blosc_get_complib_info(char *compname, char **complib, char **version);
+
+
+/**
+  Free possible memory temporaries and thread resources.  Use this
+  when you are not going to use Blosc for a long while.  In case of
+  problems releasing the resources, it returns a negative number, else
+  it returns 0.
+  */
+BLOSC_EXPORT int blosc_free_resources(void);
 
 
 /**
@@ -167,10 +402,9 @@ int blosc_free_resources(void);
   compressed buffer for this call to work.
 
   This function should always succeed.
-*/
-
-void blosc_cbuffer_sizes(const void *cbuffer, size_t *nbytes,
-                         size_t *cbytes, size_t *blocksize);
+  */
+BLOSC_EXPORT void blosc_cbuffer_sizes(const void *cbuffer, size_t *nbytes,
+				      size_t *cbytes, size_t *blocksize);
 
 
 /**
@@ -181,26 +415,34 @@ void blosc_cbuffer_sizes(const void *cbuffer, size_t *nbytes,
     * bit 0: whether the shuffle filter has been applied or not
     * bit 1: whether the internal buffer is a pure memcpy or not
 
-  You can use the `BLOSC_DOSHUFFLE` and `BLOSC_MEMCPYED` symbols for
-  extracting the interesting bits (e.g. ``flags & BLOSC_DOSHUFFLE``
-  says whether the buffer is shuffled or not).
+  You can use the `BLOSC_DOSHUFFLE`, `BLOSC_DOBITSHUFFLE` and
+  `BLOSC_MEMCPYED` symbols for extracting the interesting bits
+  (e.g. ``flags & BLOSC_DOSHUFFLE`` says whether the buffer is
+  byte-shuffled or not).
 
   This function should always succeed.
-*/
-
-void blosc_cbuffer_metainfo(const void *cbuffer, size_t *typesize,
-                            int *flags);
+  */
+BLOSC_EXPORT void blosc_cbuffer_metainfo(const void *cbuffer, size_t *typesize,
+					 int *flags);
 
 
 /**
   Return information about a compressed buffer, namely the internal
   Blosc format version (`version`) and the format for the internal
-  Lempel-Ziv algorithm (`versionlz`).  This function should always
-  succeed.
-*/
+  Lempel-Ziv compressor used (`versionlz`).
+
+  This function should always succeed.
+  */
+BLOSC_EXPORT void blosc_cbuffer_versions(const void *cbuffer, int *version,
+                                             int *versionlz);
+
 
-void blosc_cbuffer_versions(const void *cbuffer, int *version,
-                            int *versionlz);
+/**
+  Return the compressor library/format used in a compressed buffer.
+
+  This function should always succeed.
+  */
+BLOSC_EXPORT char *blosc_cbuffer_complib(const void *cbuffer);
 
 
 
@@ -210,13 +452,22 @@ void blosc_cbuffer_versions(const void *cbuffer, int *version,
 
 *********************************************************************/
 
+/* Get the internal blocksize to be used during compression.  0 means
+   that an automatic blocksize is computed internally. */
+BLOSC_EXPORT int blosc_get_blocksize(void);
 
 /**
   Force the use of a specific blocksize.  If 0, an automatic
   blocksize will be used (the default).
-*/
 
-void blosc_set_blocksize(size_t blocksize);
+  The blocksize is a critical parameter with important restrictions in
+  the allowed values, so use this with care.
+  */
+BLOSC_EXPORT void blosc_set_blocksize(size_t blocksize);
+
+#ifdef __cplusplus
+}
+#endif
 
 
 #endif
diff --git a/thirdparty/blosc/blosclz.c b/thirdparty/blosc/blosclz.c
index 5f0ac5ff17defb0cb16dc975dca5459fa5735cc1..d5be92b63fdc605defe13f04b25c6c05e4240076 100644
--- a/thirdparty/blosc/blosclz.c
+++ b/thirdparty/blosc/blosclz.c
@@ -1,7 +1,7 @@
 /*********************************************************************
-  Blosc - Blocked Suffling and Compression Library
+  Blosc - Blocked Shuffling and Compression Library
 
-  Author: Francesc Alted <faltet@gmail.com>
+  Author: Francesc Alted <francesc@blosc.org>
   Creation date: 2009-05-20
 
   See LICENSES/BLOSC.txt for details about copyright and rights to use.
@@ -20,7 +20,17 @@
 
 #if defined(_WIN32) && !defined(__MINGW32__)
   #include <windows.h>
-  #include "win32/stdint-windows.h"
+
+  /* stdint.h only available in VS2010 (VC++ 16.0) and newer */
+  #if defined(_MSC_VER) && _MSC_VER < 1600
+    #include "win32/stdint-windows.h"
+  #else
+    #include <stdint.h>
+  #endif
+  /* llabs only available in VS2013 (VC++ 18.0) and newer */
+  #if defined(_MSC_VER) && _MSC_VER < 1800
+    #define llabs(v) abs(v)
+  #endif
 #else
   #include <stdint.h>
 #endif  /* _WIN32 */
@@ -35,7 +45,7 @@
 #undef BLOSCLZ_STRICT_ALIGN
 #elif defined(__i486__) || defined(__i586__) || defined(__i686__)  /* GNU C */
 #undef BLOSCLZ_STRICT_ALIGN
-#elif defined(_M_IX86) /* Intel, MSVC */
+#elif defined(_M_IX86) || defined(_M_X64)   /* Intel, MSVC */
 #undef BLOSCLZ_STRICT_ALIGN
 #elif defined(__386)
 #undef BLOSCLZ_STRICT_ALIGN
@@ -43,6 +53,11 @@
 #undef BLOSCLZ_STRICT_ALIGN
 #elif defined(__I86__) /* Digital Mars */
 #undef BLOSCLZ_STRICT_ALIGN
+/* Seems like unaligned access in ARM (at least ARMv6) is pretty
+   expensive, so we are going to always enforce strict alignment in ARM.
+   If anybody suggests that newer ARMs are better, we can revisit this. */
+/* #elif defined(__ARM_FEATURE_UNALIGNED) */  /* ARM, GNU C */
+/* #undef BLOSCLZ_STRICT_ALIGN */
 #endif
 #endif
 
@@ -66,16 +81,11 @@
 /*
  * Use inlined functions for supported systems.
  */
-#if defined(__GNUC__) || defined(__DMC__) || defined(__POCC__) || defined(__WATCOMC__) || defined(__SUNPRO_C)
-#define BLOSCLZ_INLINE inline
-#elif defined(__BORLANDC__) || defined(_MSC_VER) || defined(__LCC__)
-#define BLOSCLZ_INLINE __inline
-#else
-#define BLOSCLZ_INLINE
+#if defined(_MSC_VER) && !defined(__cplusplus)   /* Visual Studio */
+#define inline __inline  /* Visual C is not C99, but supports some kind of inline */
 #endif
 
 #define MAX_COPY       32
-#define MAX_LEN       264  /* 256 + 8 */
 #define MAX_DISTANCE 8191
 #define MAX_FARDISTANCE (65535+MAX_DISTANCE-1)
 
@@ -86,21 +96,85 @@
 #endif
 
 
-static BLOSCLZ_INLINE int32_t hash_function(uint8_t* p, uint8_t hash_log)
-{
-  int32_t v;
+/*
+ * Fast copy macros
+ */
+#if defined(_WIN32)
+  #define CPYSIZE              32
+#else
+  #define CPYSIZE              8
+#endif
+#define MCPY(d,s)            { memcpy(d, s, CPYSIZE); d+=CPYSIZE; s+=CPYSIZE; }
+#define FASTCOPY(d,s,e)      { do { MCPY(d,s) } while (d<e); }
+#define SAFECOPY(d,s,e)      { while (d<e) { MCPY(d,s) } }
+
+/* Copy optimized for copying in blocks */
+#define BLOCK_COPY(op, ref, len, op_limit)    \
+{ int ilen = len % CPYSIZE;                   \
+  uint8_t *cpy = op + len;                    \
+  if (cpy + CPYSIZE - ilen <= op_limit) {     \
+    FASTCOPY(op, ref, cpy);                   \
+    ref -= (op-cpy); op = cpy;                \
+  }                                           \
+  else {                                      \
+    cpy -= ilen;                              \
+    SAFECOPY(op, ref, cpy);                   \
+    ref -= (op-cpy); op = cpy;                \
+    for(; ilen; --ilen)	                      \
+        *op++ = *ref++;                       \
+  }                                           \
+}
 
-  v = BLOSCLZ_READU16(p);
-  v ^= BLOSCLZ_READU16(p+1)^(v>>(16-hash_log));
-  v &= (1 << hash_log) - 1;
-  return v;
+#define SAFE_COPY(op, ref, len, op_limit)     \
+if (llabs(op-ref) < CPYSIZE) {                \
+  for(; len; --len)                           \
+    *op++ = *ref++;                           \
+}                                             \
+else BLOCK_COPY(op, ref, len, op_limit);
+
+/* Copy optimized for GCC 4.8.  Seems like long copy loops are optimal. */
+#define GCC_SAFE_COPY(op, ref, len, op_limit) \
+if ((len > 32) || (llabs(op-ref) < CPYSIZE)) { \
+  for(; len; --len)                           \
+    *op++ = *ref++;                           \
+}                                             \
+else BLOCK_COPY(op, ref, len, op_limit);
+
+/* Simple, but pretty effective hash function for 3-byte sequence */
+#define HASH_FUNCTION(v, p, l) {                       \
+    v = BLOSCLZ_READU16(p);                            \
+    v ^= BLOSCLZ_READU16(p + 1) ^ ( v >> (16 - l));    \
+    v &= (1 << l) - 1;                                 \
 }
 
+/* Another version which seems to be a bit more effective than the above,
+ * but a bit slower.  Could be interesting for high opt_level.
+ */
+#define MINMATCH 3
+#define HASH_FUNCTION2(v, p, l) {                       \
+  v = BLOSCLZ_READU16(p);				\
+  v = (v * 2654435761U) >> ((MINMATCH * 8) - (l + 1));  \
+  v &= (1 << l) - 1;					\
+}
+
+#define LITERAL(ip, op, op_limit, anchor, copy) {        \
+  if (BLOSCLZ_UNEXPECT_CONDITIONAL(op+2 > op_limit))     \
+    goto out;                                            \
+  *op++ = *anchor++;                                     \
+  ip = anchor;                                           \
+  copy++;                                                \
+  if(BLOSCLZ_UNEXPECT_CONDITIONAL(copy == MAX_COPY)) {   \
+    copy = 0;                                            \
+    *op++ = MAX_COPY-1;                                  \
+  }                                                      \
+  continue;                                              \
+}
 
 #define IP_BOUNDARY 2
 
-int blosclz_compress(int opt_level, const void* input,
-                     int length, void* output, int maxout)
+
+int blosclz_compress(const int opt_level, const void* input, int length,
+                     void* output, int maxout, int accel)
 {
   uint8_t* ip = (uint8_t*) input;
   uint8_t* ibase = (uint8_t*) input;
@@ -109,48 +183,38 @@ int blosclz_compress(int opt_level, const void* input,
   uint8_t* op = (uint8_t*) output;
 
   /* Hash table depends on the opt level.  Hash_log cannot be larger than 15. */
-  uint8_t hash_log_[10] = {-1, 8, 9, 9, 11, 11, 12, 13, 14, 15};
+  /* The parametrization below is made from playing with the bench suite, like:
+     $ bench/bench blosclz single 4
+     $ bench/bench blosclz single 4 4194280 12 25
+     and taking the minimum times on a i5-3380M @ 2.90GHz.
+     Curiously enough, values >= 14 do not always
+     get maximum compression, even with large blocksizes. */
+  int8_t hash_log_[10] = {-1, 11, 11, 11, 12, 13, 13, 13, 13, 13};
   uint8_t hash_log = hash_log_[opt_level];
   uint16_t hash_size = 1 << hash_log;
   uint16_t *htab;
   uint8_t* op_limit;
 
-  int32_t hslot;
   int32_t hval;
   uint8_t copy;
 
-  double maxlength_[10] = {-1, .1, .15, .2, .5, .7, .85, .925, .975, 1.0};
+  double maxlength_[10] = {-1, .1, .15, .2, .3, .45, .6, .75, .9, 1.0};
   int32_t maxlength = (int32_t) (length * maxlength_[opt_level]);
   if (maxlength > (int32_t) maxout) {
     maxlength = (int32_t) maxout;
   }
   op_limit = op + maxlength;
 
-  /* output buffer cannot be less than 66 bytes or we can get into problems.
-     As output is usually the same length than input, we take input length. */
-  if (length < 66) {
-    return 0;                   /* Mark this as uncompressible */
+  /* output buffer cannot be less than 66 bytes or we can get into trouble */
+  if (BLOSCLZ_UNEXPECT_CONDITIONAL(maxlength < 66 || length < 4)) {
+    return 0;
   }
 
-  htab = (uint16_t *) malloc(hash_size*sizeof(uint16_t));
-
-  /* sanity check */
-  if(BLOSCLZ_UNEXPECT_CONDITIONAL(length < 4)) {
-    if(length) {
-      /* create literal copy only */
-      *op++ = length-1;
-      ip_bound++;
-      while(ip <= ip_bound)
-        *op++ = *ip++;
-      free(htab);
-      return length+1;
-    }
-    else goto out;
-  }
+  /* prepare the acceleration to be used in condition */
+  accel = accel < 1 ? 1 : accel;
+  accel -= 1;
 
-  /* initializes hash table */
-  for (hslot = 0; hslot < hash_size; hslot++)
-    htab[hslot] = 0;
+  htab = (uint16_t *) calloc(hash_size, sizeof(uint16_t));
 
   /* we start with literal copy */
   copy = 2;
@@ -174,23 +238,25 @@ int blosclz_compress(int opt_level, const void* input,
     }
 
     /* find potential match */
-    hval = hash_function(ip, hash_log);
+    HASH_FUNCTION(hval, ip, hash_log);
     ref = ibase + htab[hval];
-    /* update hash table */
-    htab[hval] = (uint16_t)(anchor - ibase);
 
     /* calculate distance to the match */
     distance = (int32_t)(anchor - ref);
 
+    /* update hash table if necessary */
+    if ((distance & accel) == 0)
+      htab[hval] = (uint16_t)(anchor - ibase);
+
     /* is this a match? check the first 3 bytes */
     if (distance==0 || (distance >= MAX_FARDISTANCE) ||
         *ref++ != *ip++ || *ref++!=*ip++ || *ref++!=*ip++)
-      goto literal;
+      LITERAL(ip, op, op_limit, anchor, copy);
 
     /* far, needs at least 5-byte match */
-    if (distance >= MAX_DISTANCE) {
+    if (opt_level >= 5 && distance >= MAX_DISTANCE) {
       if (*ip++ != *ref++ || *ip++ != *ref++)
-        goto literal;
+        LITERAL(ip, op, op_limit, anchor, copy);
       len += 2;
     }
 
@@ -210,7 +276,11 @@ int blosclz_compress(int opt_level, const void* input,
       memset(&value, x, 8);
       /* safe because the outer check against ip limit */
       while (ip < (ip_bound - (sizeof(int64_t) - IP_BOUNDARY))) {
+#if !defined(BLOSCLZ_STRICT_ALIGN)
         value2 = ((int64_t *)ref)[0];
+#else
+        memcpy(&value2, ref, 8);
+#endif
         if (value != value2) {
           /* Find the byte that starts to differ */
           while (ip < ip_bound) {
@@ -233,18 +303,17 @@ int blosclz_compress(int opt_level, const void* input,
       for(;;) {
         /* safe because the outer check against ip limit */
         while (ip < (ip_bound - (sizeof(int64_t) - IP_BOUNDARY))) {
-          if (*ref++ != *ip++) break;
+#if !defined(BLOSCLZ_STRICT_ALIGN)
           if (((int64_t *)ref)[0] != ((int64_t *)ip)[0]) {
+#endif
             /* Find the byte that starts to differ */
             while (ip < ip_bound) {
               if (*ref++ != *ip++) break;
             }
             break;
-          }
-          else {
-            ip += 8;
-            ref += 8;
-          }
+#if !defined(BLOSCLZ_STRICT_ALIGN)
+          } else { ip += 8; ref += 8; }
+#endif
         }
         /* Last correction before exiting loop */
         if (ip > ip_bound) {
@@ -310,25 +379,13 @@ int blosclz_compress(int opt_level, const void* input,
     }
 
     /* update the hash at match boundary */
-    hval = hash_function(ip, hash_log);
+    HASH_FUNCTION(hval, ip, hash_log);
     htab[hval] = (uint16_t)(ip++ - ibase);
-    hval = hash_function(ip, hash_log);
+    HASH_FUNCTION(hval, ip, hash_log);
     htab[hval] = (uint16_t)(ip++ - ibase);
 
     /* assuming literal copy */
     *op++ = MAX_COPY-1;
-
-    continue;
-
-  literal:
-    if (BLOSCLZ_UNEXPECT_CONDITIONAL(op+2 > op_limit)) goto out;
-    *op++ = *anchor++;
-    ip = anchor;
-    copy++;
-    if(BLOSCLZ_UNEXPECT_CONDITIONAL(copy == MAX_COPY)) {
-      copy = 0;
-      *op++ = MAX_COPY-1;
-    }
   }
 
   /* left-over as literal copy */
@@ -361,7 +418,6 @@ int blosclz_compress(int opt_level, const void* input,
 
 }
 
-
 int blosclz_decompress(const void* input, int length, void* output, int maxout)
 {
   const uint8_t* ip = (const uint8_t*) input;
@@ -372,7 +428,7 @@ int blosclz_decompress(const void* input, int length, void* output, int maxout)
   int32_t loop = 1;
 
   do {
-    const uint8_t* ref = op;
+    uint8_t* ref = op;
     int32_t len = ctrl >> 5;
     int32_t ofs = (ctrl & 31) << 8;
 
@@ -421,21 +477,11 @@ int blosclz_decompress(const void* input, int length, void* output, int maxout)
         /* copy from reference */
         ref--;
         len += 3;
-        if (abs((int32_t)(ref-op)) <= (int32_t)len) {
-          /* src and dst do overlap: do a loop */
-          for(; len; --len)
-            *op++ = *ref++;
-          /* The memmove below does not work well (don't know why) */
-          /* memmove(op, ref, len);
-             op += len;
-             ref += len;
-             len = 0; */
-        }
-        else {
-          memcpy(op, ref, len);
-          op += len;
-          ref += len;
-        }
+#if !defined(_WIN32) && ((defined(__GNUC__) || defined(__INTEL_COMPILER) || !defined(__clang__)))
+        GCC_SAFE_COPY(op, ref, len, op_limit);
+#else
+        SAFE_COPY(op, ref, len, op_limit);
+#endif
       }
     }
     else {
@@ -449,9 +495,7 @@ int blosclz_decompress(const void* input, int length, void* output, int maxout)
       }
 #endif
 
-      memcpy(op, ip, ctrl);
-      ip += ctrl;
-      op += ctrl;
+      BLOCK_COPY(op, ip, ctrl, op_limit);
 
       loop = (int32_t)BLOSCLZ_EXPECT_CONDITIONAL(ip < ip_limit);
       if(loop)
diff --git a/thirdparty/blosc/blosclz.h b/thirdparty/blosc/blosclz.h
index f0e0011e92035a086d308cc74bad25b6da82278e..792a9b64c7d5e52ea4d33c0e1a7831aa15e6980b 100644
--- a/thirdparty/blosc/blosclz.h
+++ b/thirdparty/blosc/blosclz.h
@@ -1,7 +1,7 @@
 /*********************************************************************
-  Blosc - Blocked Suffling and Compression Library
+  Blosc - Blocked Shuffling and Compression Library
 
-  Author: Francesc Alted <faltet@gmail.com>
+  Author: Francesc Alted <francesc@blosc.org>
 
   See LICENSES/BLOSC.txt for details about copyright and rights to use.
 **********************************************************************/
@@ -32,11 +32,16 @@ extern "C" {
   bytes, the return value will be 0 and you will have to discard the
   output buffer.
 
+  The acceleration parameter is related to the frequency of
+  updating the internal hash.  An acceleration of 1 means that the
+  internal hash is updated at full rate.  A value < 1 is not allowed
+  and will be silently set to 1.
+
   The input buffer and the output buffer can not overlap.
 */
 
-int blosclz_compress(int opt_level, const void* input, int length,
-                     void* output, int maxout);
+int blosclz_compress(const int opt_level, const void* input, int length,
+                     void* output, int maxout, int accel);
 
 /**
   Decompress a block of compressed data and returns the size of the
diff --git a/thirdparty/blosc/shuffle-common.h b/thirdparty/blosc/shuffle-common.h
new file mode 100644
index 0000000000000000000000000000000000000000..3dacd6ed9c2dffcf6b809c00f0b3cf2e753b7402
--- /dev/null
+++ b/thirdparty/blosc/shuffle-common.h
@@ -0,0 +1,34 @@
+/*********************************************************************
+  Blosc - Blocked Shuffling and Compression Library
+
+  Author: Francesc Alted <francesc@blosc.org>
+
+  See LICENSES/BLOSC.txt for details about copyright and rights to use.
+**********************************************************************/
+
+#ifndef SHUFFLE_COMMON_H
+#define SHUFFLE_COMMON_H
+
+#include "blosc-export.h"
+
+/* Define the __SSE2__ symbol if compiling with Visual C++ and
+   targeting the minimum architecture level supporting SSE2.
+   Other compilers define this as expected and emit warnings
+   when it is re-defined. */
+#if !defined(__SSE2__) && defined(_MSC_VER) && \
+    (defined(_M_X64) || (defined(_M_IX86) && _M_IX86_FP >= 2))
+  #define __SSE2__
+#endif
+
+/* Import standard integer type definitions */
+#if defined(_WIN32) && !defined(__MINGW32__)
+  #include <windows.h>
+  #include "win32/stdint-windows.h"
+#else
+  #include <stdint.h>
+  #include <stddef.h>
+  #include <inttypes.h>
+  #include <string.h>
+#endif  /* _WIN32 */
+
+#endif  /* SHUFFLE_COMMON_H */
diff --git a/thirdparty/blosc/shuffle-generic.c b/thirdparty/blosc/shuffle-generic.c
new file mode 100644
index 0000000000000000000000000000000000000000..46c6e8311fe13e1fe65f5e306548c46c357eba37
--- /dev/null
+++ b/thirdparty/blosc/shuffle-generic.c
@@ -0,0 +1,25 @@
+/*********************************************************************
+  Blosc - Blocked Shuffling and Compression Library
+
+  Author: Francesc Alted <francesc@blosc.org>
+
+  See LICENSES/BLOSC.txt for details about copyright and rights to use.
+**********************************************************************/
+
+#include "shuffle-generic.h"
+
+/* Shuffle a whole block with the scalar routine.  This can never fail. */
+void shuffle_generic(const size_t bytesoftype, const size_t blocksize,
+		     const uint8_t* const _src, uint8_t* const _dest)
+{
+  /* vectorizable_blocksize = 0: the scalar loop processes every element. */
+  shuffle_generic_inline(bytesoftype, 0, blocksize, _src, _dest);
+}
+
+/* Unshuffle a whole block with the scalar routine.  This can never fail. */
+void unshuffle_generic(const size_t bytesoftype, const size_t blocksize,
+                       const uint8_t* const _src, uint8_t* const _dest)
+{
+  /* vectorizable_blocksize = 0: the scalar loop processes every element. */
+  unshuffle_generic_inline(bytesoftype, 0, blocksize, _src, _dest);
+}
diff --git a/thirdparty/blosc/shuffle-generic.h b/thirdparty/blosc/shuffle-generic.h
new file mode 100644
index 0000000000000000000000000000000000000000..c07a24920ad42160ce494ffee8182b0569794f93
--- /dev/null
+++ b/thirdparty/blosc/shuffle-generic.h
@@ -0,0 +1,99 @@
+/*********************************************************************
+  Blosc - Blocked Shuffling and Compression Library
+
+  Author: Francesc Alted <francesc@blosc.org>
+
+  See LICENSES/BLOSC.txt for details about copyright and rights to use.
+**********************************************************************/
+
+/* Generic (non-hardware-accelerated) shuffle/unshuffle routines.
+   These are used when hardware-accelerated functions aren't available
+   for a particular platform; they are also used by the hardware-
+   accelerated functions to handle any remaining elements in a block
+   which isn't a multiple of the hardware's vector size. */
+
+#ifndef SHUFFLE_GENERIC_H
+#define SHUFFLE_GENERIC_H
+
+#include "shuffle-common.h"
+#include <stdlib.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+  Scalar shuffle loop shared by the generic and the vectorized shufflers.
+  Byte j of element i is written to _dest[j*nelem + i], where
+  nelem = blocksize / type_size.  Elements below
+  vectorizable_blocksize / type_size are skipped (left to the caller);
+  the trailing blocksize % type_size bytes are copied unshuffled.
+*/
+static void shuffle_generic_inline(const size_t type_size,
+    const size_t vectorizable_blocksize, const size_t blocksize,
+    const uint8_t* const _src, uint8_t* const _dest)
+{
+  size_t i, j;
+  /* Element count and tail size; type_size must be non-zero (it is a divisor). */
+  const size_t neblock_quot = blocksize / type_size;
+  const size_t neblock_rem = blocksize % type_size;
+  const size_t vectorizable_elements = vectorizable_blocksize / type_size;
+
+
+  /* Outer loop walks byte position j, inner loop walks the remaining elements i. */
+  for (j = 0; j < type_size; j++) {
+    for (i = vectorizable_elements; i < (size_t)neblock_quot; i++) {
+      _dest[j*neblock_quot+i] = _src[i*type_size+j];
+    }
+  }
+
+  /* Copy the tail bytes (those past the last whole element) without shuffling them. */
+  memcpy(_dest + (blocksize - neblock_rem), _src + (blocksize - neblock_rem), neblock_rem);
+}
+
+/**
+  Scalar unshuffle loop shared by the generic and the vectorized unshufflers.
+  Inverse of the shuffle: _src[j*nelem + i] is written to byte j of
+  element i in _dest, where nelem = blocksize / type_size.  Elements below
+  vectorizable_blocksize / type_size are skipped (left to the caller);
+  the trailing blocksize % type_size bytes are copied unchanged.
+*/
+static void unshuffle_generic_inline(const size_t type_size,
+  const size_t vectorizable_blocksize, const size_t blocksize,
+  const uint8_t* const _src, uint8_t* const _dest)
+{
+  size_t i, j;
+
+  /* Element count and tail size; type_size must be non-zero (it is a divisor). */
+  const size_t neblock_quot = blocksize / type_size;
+  const size_t neblock_rem = blocksize % type_size;
+  const size_t vectorizable_elements = vectorizable_blocksize / type_size;
+
+  /* Outer loop walks the remaining elements i, inner loop walks byte position j. */
+  for (i = vectorizable_elements; i < (size_t)neblock_quot; i++) {
+    for (j = 0; j < type_size; j++) {
+      _dest[i*type_size+j] = _src[j*neblock_quot+i];
+    }
+  }
+
+  /* Copy the tail bytes (those past the last whole element) without unshuffling them. */
+  memcpy(_dest + (blocksize - neblock_rem), _src + (blocksize - neblock_rem), neblock_rem);
+}
+
+/**
+  Generic (non-hardware-accelerated) shuffle routine.
+*/
+BLOSC_NO_EXPORT void shuffle_generic(const size_t bytesoftype, const size_t blocksize,
+                                      const uint8_t* const _src, uint8_t* const _dest);
+
+/**
+  Generic (non-hardware-accelerated) unshuffle routine.
+*/
+BLOSC_NO_EXPORT void unshuffle_generic(const size_t bytesoftype, const size_t blocksize,
+                                        const uint8_t* const _src, uint8_t* const _dest);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* SHUFFLE_GENERIC_H */
diff --git a/thirdparty/blosc/shuffle.c b/thirdparty/blosc/shuffle.c
index cbc67014340851ca82d999aae862d8dad456f3fb..e8983bd6ea8cf73d60c46c83573d95cde061c6cf 100644
--- a/thirdparty/blosc/shuffle.c
+++ b/thirdparty/blosc/shuffle.c
@@ -1,502 +1,445 @@
 /*********************************************************************
-  Blosc - Blocked Suffling and Compression Library
+  Blosc - Blocked Shuffling and Compression Library
 
-  Author: Francesc Alted <faltet@gmail.com>
+  Author: Francesc Alted <francesc@blosc.org>
   Creation date: 2009-05-20
 
   See LICENSES/BLOSC.txt for details about copyright and rights to use.
 **********************************************************************/
 
+#include "shuffle.h"
+#include "shuffle-common.h"
+#include "shuffle-generic.h"
+#include "bitshuffle-generic.h"
 #include <stdio.h>
 #include <string.h>
-#include "shuffle.h"
 
-#if defined(_WIN32) && !defined(__MINGW32__)
-  #include <windows.h>
-  #include "win32/stdint-windows.h"
-  #define __SSE2__          /* Windows does not define this by default */
+/* Visual Studio < 2013 does not have stdbool.h, so here is a replacement: */
+#if defined __STDC__ && defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L
+/* have a C99 compiler */
+typedef _Bool bool;
 #else
-  #include <stdint.h>
-  #include <inttypes.h>
-#endif  /* _WIN32 */
-
+/* do not have a C99 compiler */
+typedef unsigned char bool;
+#endif
+static const bool false = 0;
+static const bool true = 1;
 
-/* The non-SSE2 versions of shuffle and unshuffle */
 
-/* Shuffle a block.  This can never fail. */
-static void _shuffle(size_t bytesoftype, size_t blocksize,
-	                 uint8_t* _src, uint8_t* _dest)
-{
-  size_t i, j, neblock, leftover;
+#if !defined(__clang__) && defined(__GNUC__) && defined(__GNUC_MINOR__) && \
+    (__GNUC__ >= 5 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8))
+#define HAVE_CPU_FEAT_INTRIN
+#endif  /* non-clang GCC >= 4.8 */
 
-  /* Non-optimized shuffle */
-  neblock = blocksize / bytesoftype;  /* Number of elements in a block */
-  for (j = 0; j < bytesoftype; j++) {
-    for (i = 0; i < neblock; i++) {
-      _dest[j*neblock+i] = _src[i*bytesoftype+j];
-    }
+/*  Include hardware-accelerated shuffle/unshuffle routines based on
+    the target architecture. Note that a target architecture may support
+    more than one type of acceleration!*/
+#if defined(SHUFFLE_AVX2_ENABLED)
+  #include "shuffle-avx2.h"
+  #include "bitshuffle-avx2.h"
+#endif  /* defined(SHUFFLE_AVX2_ENABLED) */
+
+#if defined(SHUFFLE_SSE2_ENABLED)
+  #include "shuffle-sse2.h"
+  #include "bitshuffle-sse2.h"
+#endif  /* defined(SHUFFLE_SSE2_ENABLED) */
+
+
+/*  Define function pointer types for shuffle/unshuffle routines. */
+typedef void(*shuffle_func)(const size_t, const size_t, const uint8_t*, const uint8_t*);
+typedef void(*unshuffle_func)(const size_t, const size_t, const uint8_t*, const uint8_t*);
+typedef int64_t(*bitshuffle_func)(void*, void*, const size_t, const size_t, void*);
+typedef int64_t(*bitunshuffle_func)(void*, void*, const size_t, const size_t, void*);
+
+/* An implementation of shuffle/unshuffle routines. */
+typedef struct shuffle_implementation {
+  /* Name of this implementation. */
+  const char* name;
+  /* Function pointer to the shuffle routine for this implementation. */
+  shuffle_func shuffle;
+  /* Function pointer to the unshuffle routine for this implementation. */
+  unshuffle_func unshuffle;
+  /* Function pointer to the bitshuffle routine for this implementation. */
+  bitshuffle_func bitshuffle;
+  /* Function pointer to the bitunshuffle routine for this implementation. */
+  bitunshuffle_func bitunshuffle;
+} shuffle_implementation_t;
+
+typedef enum {
+  BLOSC_HAVE_NOTHING = 0,  /* no accelerated instruction set detected */
+  BLOSC_HAVE_SSE2 = 1,     /* flag bit, OR-ed into the detection result */
+  BLOSC_HAVE_AVX2 = 2      /* flag bit, OR-ed into the detection result */
+} blosc_cpu_features;
+
+/*  Detect hardware and set function pointers to the best shuffle/unshuffle
+    implementations supported by the host processor. */
+#if defined(SHUFFLE_AVX2_ENABLED) || defined(SHUFFLE_SSE2_ENABLED)    /* Intel/i686 */
+
+/*  Disabled the __builtin_cpu_supports() call, as it has issues with
+    new versions of gcc (like 5.3.1 in forthcoming ubuntu/xenial):
+      "undefined symbol: __cpu_model"
+    For a similar report, see:
+    https://lists.fedoraproject.org/archives/list/devel@lists.fedoraproject.org/thread/ZM2L65WIZEEQHHLFERZYD5FAG7QY2OGB/
+*/
+#if defined(HAVE_CPU_FEAT_INTRIN) && 0
+static blosc_cpu_features blosc_get_cpu_features(void) {
+  blosc_cpu_features cpu_features = BLOSC_HAVE_NOTHING;
+  if (__builtin_cpu_supports("sse2")) {
+    cpu_features |= BLOSC_HAVE_SSE2;
   }
-  leftover = blocksize % bytesoftype;
-  memcpy(_dest + neblock*bytesoftype, _src + neblock*bytesoftype, leftover);
-}
-
-/* Unshuffle a block.  This can never fail. */
-static void _unshuffle(size_t bytesoftype, size_t blocksize,
-                       uint8_t* _src, uint8_t* _dest)
-{
-  size_t i, j, neblock, leftover;
-
-  /* Non-optimized unshuffle */
-  neblock = blocksize / bytesoftype;  /* Number of elements in a block */
-  for (i = 0; i < neblock; i++) {
-    for (j = 0; j < bytesoftype; j++) {
-      _dest[i*bytesoftype+j] = _src[j*neblock+i];
-    }
+  if (__builtin_cpu_supports("avx2")) {
+    cpu_features |= BLOSC_HAVE_AVX2;
   }
-  leftover = blocksize % bytesoftype;
-  memcpy(_dest+neblock*bytesoftype, _src+neblock*bytesoftype, leftover);
+  return cpu_features;
 }
+#else
 
+#if defined(_MSC_VER) && !defined(__clang__)
+  #include <intrin.h>     /* Needed for __cpuid */
 
-#ifdef __SSE2__
-
-/* The SSE2 versions of shuffle and unshuffle */
-
-#include <emmintrin.h>
-
-/* The next is useful for debugging purposes */
-#if 0
-static void printxmm(__m128i xmm0)
-{
-  uint8_t buf[16];
-
-  ((__m128i *)buf)[0] = xmm0;
-  printf("%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x\n",
-          buf[0], buf[1], buf[2], buf[3],
-          buf[4], buf[5], buf[6], buf[7],
-          buf[8], buf[9], buf[10], buf[11],
-          buf[12], buf[13], buf[14], buf[15]);
-}
-#endif
+/*  _xgetbv is only supported by VS2010 SP1 and newer versions of VS. */
+#if _MSC_FULL_VER >= 160040219
+  #include <immintrin.h>  /* Needed for _xgetbv */
+#elif defined(_M_IX86)
 
+/*  Implement _xgetbv for VS2008 and VS2010 RTM with 32-bit (x86) targets. */
 
-/* Routine optimized for shuffling a buffer for a type size of 2 bytes. */
-static void
-shuffle2(uint8_t* dest, uint8_t* src, size_t size)
-{
-  size_t i, j, k;
-  size_t numof16belem;
-  __m128i xmm0[2], xmm1[2];
-
-  numof16belem = size / (16*2);
-  for (i = 0, j = 0; i < numof16belem; i++, j += 16*2) {
-    /* Fetch and transpose bytes, words and double words in groups of
-       32 bytes */
-    for (k = 0; k < 2; k++) {
-      xmm0[k] = _mm_loadu_si128((__m128i*)(src+j+k*16));
-      xmm0[k] = _mm_shufflelo_epi16(xmm0[k], 0xd8);
-      xmm0[k] = _mm_shufflehi_epi16(xmm0[k], 0xd8);
-      xmm0[k] = _mm_shuffle_epi32(xmm0[k], 0xd8);
-      xmm1[k] = _mm_shuffle_epi32(xmm0[k], 0x4e);
-      xmm0[k] = _mm_unpacklo_epi8(xmm0[k], xmm1[k]);
-      xmm0[k] = _mm_shuffle_epi32(xmm0[k], 0xd8);
-      xmm1[k] = _mm_shuffle_epi32(xmm0[k], 0x4e);
-      xmm0[k] = _mm_unpacklo_epi16(xmm0[k], xmm1[k]);
-      xmm0[k] = _mm_shuffle_epi32(xmm0[k], 0xd8);
-    }
-    /* Transpose quad words */
-    for (k = 0; k < 1; k++) {
-      xmm1[k*2] = _mm_unpacklo_epi64(xmm0[k], xmm0[k+1]);
-      xmm1[k*2+1] = _mm_unpackhi_epi64(xmm0[k], xmm0[k+1]);
-    }
-    /* Store the result vectors */
-    for (k = 0; k < 2; k++) {
-      ((__m128i *)dest)[k*numof16belem+i] = xmm1[k];
+static uint64_t _xgetbv(uint32_t xcr) {
+    uint32_t xcr0, xcr1;
+    __asm {
+        mov        ecx, xcr
+        _asm _emit 0x0f _asm _emit 0x01 _asm _emit 0xd0
+        mov        xcr0, eax
+        mov        xcr1, edx
     }
-  }
+    return ((uint64_t)xcr1 << 32) | xcr0;
 }
 
+#elif defined(_M_X64)
 
-/* Routine optimized for shuffling a buffer for a type size of 4 bytes. */
-static void
-shuffle4(uint8_t* dest, uint8_t* src, size_t size)
-{
-  size_t i, j, k;
-  size_t numof16belem;
-  __m128i xmm0[4], xmm1[4];
-
-  numof16belem = size / (16*4);
-  for (i = 0, j = 0; i < numof16belem; i++, j += 16*4) {
-    /* Fetch and transpose bytes and words in groups of 64 bytes */
-    for (k = 0; k < 4; k++) {
-      xmm0[k] = _mm_loadu_si128((__m128i*)(src+j+k*16));
-      xmm1[k] = _mm_shuffle_epi32(xmm0[k], 0xd8);
-      xmm0[k] = _mm_shuffle_epi32(xmm0[k], 0x8d);
-      xmm0[k] = _mm_unpacklo_epi8(xmm1[k], xmm0[k]);
-      xmm1[k] = _mm_shuffle_epi32(xmm0[k], 0x04e);
-      xmm0[k] = _mm_unpacklo_epi16(xmm0[k], xmm1[k]);
-    }
-    /* Transpose double words */
-    for (k = 0; k < 2; k++) {
-      xmm1[k*2] = _mm_unpacklo_epi32(xmm0[k*2], xmm0[k*2+1]);
-      xmm1[k*2+1] = _mm_unpackhi_epi32(xmm0[k*2], xmm0[k*2+1]);
-    }
-    /* Transpose quad words */
-    for (k = 0; k < 2; k++) {
-      xmm0[k*2] = _mm_unpacklo_epi64(xmm1[k], xmm1[k+2]);
-      xmm0[k*2+1] = _mm_unpackhi_epi64(xmm1[k], xmm1[k+2]);
-    }
-    /* Store the result vectors */
-    for (k = 0; k < 4; k++) {
-      ((__m128i *)dest)[k*numof16belem+i] = xmm0[k];
-    }
-  }
+/*  Implement _xgetbv for VS2008 and VS2010 RTM with 64-bit (x64) targets.
+    These compilers don't support any of the newer acceleration ISAs
+    (e.g., AVX2) supported by blosc, and all x64 hardware supports SSE2
+    which means we can get away with returning a hard-coded value from
+    this implementation of _xgetbv. */
+
+static inline uint64_t
+_xgetbv(uint32_t xcr) {
+    /* A 64-bit OS must have XMM save support. */
+    return xcr == 0 ? (1UL << 1) : 0UL;
 }
 
+#else
 
-/* Routine optimized for shuffling a buffer for a type size of 8 bytes. */
-static void
-shuffle8(uint8_t* dest, uint8_t* src, size_t size)
-{
-  size_t i, j, k, l;
-  size_t numof16belem;
-  __m128i xmm0[8], xmm1[8];
-
-  numof16belem = size / (16*8);
-  for (i = 0, j = 0; i < numof16belem; i++, j += 16*8) {
-    /* Fetch and transpose bytes in groups of 128 bytes */
-    for (k = 0; k < 8; k++) {
-      xmm0[k] = _mm_loadu_si128((__m128i*)(src+j+k*16));
-      xmm1[k] = _mm_shuffle_epi32(xmm0[k], 0x4e);
-      xmm1[k] = _mm_unpacklo_epi8(xmm0[k], xmm1[k]);
-    }
-    /* Transpose words */
-    for (k = 0, l = 0; k < 4; k++, l +=2) {
-      xmm0[k*2] = _mm_unpacklo_epi16(xmm1[l], xmm1[l+1]);
-      xmm0[k*2+1] = _mm_unpackhi_epi16(xmm1[l], xmm1[l+1]);
-    }
-    /* Transpose double words */
-    for (k = 0, l = 0; k < 4; k++, l++) {
-      if (k == 2) l += 2;
-      xmm1[k*2] = _mm_unpacklo_epi32(xmm0[l], xmm0[l+2]);
-      xmm1[k*2+1] = _mm_unpackhi_epi32(xmm0[l], xmm0[l+2]);
-    }
-    /* Transpose quad words */
-    for (k = 0; k < 4; k++) {
-      xmm0[k*2] = _mm_unpacklo_epi64(xmm1[k], xmm1[k+4]);
-      xmm0[k*2+1] = _mm_unpackhi_epi64(xmm1[k], xmm1[k+4]);
-    }
-    /* Store the result vectors */
-    for (k = 0; k < 8; k++) {
-      ((__m128i *)dest)[k*numof16belem+i] = xmm0[k];
-    }
-  }
-}
+/* Hardware detection for any other MSVC targets (e.g., ARM)
+   isn't implemented at this time. */
+#error This version of c-blosc only supports x86 and x64 targets with MSVC.
 
+#endif /* _MSC_FULL_VER >= 160040219 */
+  
+#else
 
-/* Routine optimized for shuffling a buffer for a type size of 16 bytes. */
-static void
-shuffle16(uint8_t* dest, uint8_t* src, size_t size)
-{
-  size_t i, j, k, l;
-  size_t numof16belem;
-  __m128i xmm0[16], xmm1[16];
+/*  GCC/Clang implementation of the __cpuidex intrinsic: runs cpuid with eax=function_id
+    and ecx=subfunction_id, then stores eax, ebx, ecx, edx in cpuInfo[0..3]. */
+__attribute__((always_inline))
+static inline void
+__cpuidex(int32_t cpuInfo[4], int32_t function_id, int32_t subfunction_id) {
+  __asm__ __volatile__ (
+# if defined(__i386__) && defined (__PIC__)
+  /*  ebx can't be clobbered in 32-bit PIC code: save it in edi, run cpuid, then swap
+      back so edi carries cpuid's ebx. See https://software.intel.com/en-us/articles/how-to-detect-new-instruction-support-in-the-4th-generation-intel-core-processor-family
+  */
+    "movl %%ebx, %%edi\n\t"
+    "cpuid\n\t"
+    "xchgl %%ebx, %%edi":
+    "=D" (cpuInfo[1]),
+#else
+    "cpuid":
+    "=b" (cpuInfo[1]),
+#endif  /* defined(__i386__) && defined(__PIC__) */
+    "=a" (cpuInfo[0]),
+    "=c" (cpuInfo[2]),
+    "=d" (cpuInfo[3]) :
+    "a" (function_id), "c" (subfunction_id)
+    );
+}
 
-  numof16belem = size / (16*16);
-  for (i = 0, j = 0; i < numof16belem; i++, j += 16*16) {
-    /* Fetch elements in groups of 256 bytes */
-    for (k = 0; k < 16; k++) {
-      xmm0[k] = _mm_loadu_si128((__m128i*)(src+j+k*16));
-    }
-    /* Transpose bytes */
-    for (k = 0, l = 0; k < 8; k++, l +=2) {
-      xmm1[k*2] = _mm_unpacklo_epi8(xmm0[l], xmm0[l+1]);
-      xmm1[k*2+1] = _mm_unpackhi_epi8(xmm0[l], xmm0[l+1]);
-    }
-    /* Transpose words */
-    for (k = 0, l = -2; k < 8; k++, l++) {
-      if ((k%2) == 0) l += 2;
-      xmm0[k*2] = _mm_unpacklo_epi16(xmm1[l], xmm1[l+2]);
-      xmm0[k*2+1] = _mm_unpackhi_epi16(xmm1[l], xmm1[l+2]);
-    }
-    /* Transpose double words */
-    for (k = 0, l = -4; k < 8; k++, l++) {
-      if ((k%4) == 0) l += 4;
-      xmm1[k*2] = _mm_unpacklo_epi32(xmm0[l], xmm0[l+4]);
-      xmm1[k*2+1] = _mm_unpackhi_epi32(xmm0[l], xmm0[l+4]);
-    }
-    /* Transpose quad words */
-    for (k = 0; k < 8; k++) {
-      xmm0[k*2] = _mm_unpacklo_epi64(xmm1[k], xmm1[k+8]);
-      xmm0[k*2+1] = _mm_unpackhi_epi64(xmm1[k], xmm1[k+8]);
-    }
-    /* Store the result vectors */
-    for (k = 0; k < 16; k++) {
-      ((__m128i *)dest)[k*numof16belem+i] = xmm0[k];
-    }
-  }
+#define __cpuid(cpuInfo, function_id) __cpuidex(cpuInfo, function_id, 0)
+
+#define _XCR_XFEATURE_ENABLED_MASK 0
+
+/* Reads the content of an extended control register.
+   https://software.intel.com/en-us/articles/how-to-detect-new-instruction-support-in-the-4th-generation-intel-core-processor-family
+*/
+static inline uint64_t
+_xgetbv(uint32_t xcr) {
+  uint32_t eax, edx;
+  __asm__ __volatile__ (
+    /* "xgetbv"
+       This is specified as raw instruction bytes due to some older compilers
+       having issues with the mnemonic form.
+    */
+    ".byte 0x0f, 0x01, 0xd0":
+    "=a" (eax),
+    "=d" (edx) :
+    "c" (xcr)
+    );
+  return ((uint64_t)edx << 32) | eax;
 }
 
+#endif /* defined(_MSC_VER) && !defined(__clang__) */
 
-/* Shuffle a block.  This can never fail. */
-void shuffle(size_t bytesoftype, size_t blocksize,
-             uint8_t* _src, uint8_t* _dest) {
-  int unaligned_dest = (int)((uintptr_t)_dest % 16);
-  int power_of_two = (blocksize & (blocksize - 1)) == 0;
-  int too_small = (blocksize < 256);
+#ifndef _XCR_XFEATURE_ENABLED_MASK
+#define _XCR_XFEATURE_ENABLED_MASK 0x0
+#endif
 
-  if (unaligned_dest || !power_of_two || too_small) {
-    /* _dest buffer is not aligned, not a power of two or is too
-       small.  Call the non-sse2 version. */
-    _shuffle(bytesoftype, blocksize, _src, _dest);
-    return;
+static blosc_cpu_features blosc_get_cpu_features(void) {
+  blosc_cpu_features result = BLOSC_HAVE_NOTHING;
+  int32_t max_basic_function_id;
+  /* Holds the values of eax, ebx, ecx, edx set by the `cpuid` instruction */
+  int32_t cpu_info[4];
+  int sse2_available;
+  int sse3_available;
+  int ssse3_available;
+  int sse41_available;
+  int sse42_available;
+  int xsave_available;
+  int xsave_enabled_by_os;
+  int avx2_available = 0;
+  int avx512bw_available = 0;
+  int xmm_state_enabled = 0;
+  int ymm_state_enabled = 0;
+  int zmm_state_enabled = 0;
+  uint64_t xcr0_contents;
+
+  /* Get the number of basic functions available. */
+  __cpuid(cpu_info, 0);
+  max_basic_function_id = cpu_info[0];
+
+  /* Check for SSE-based features and required OS support */
+  __cpuid(cpu_info, 1);
+  sse2_available = (cpu_info[3] & (1 << 26)) != 0;
+  sse3_available = (cpu_info[2] & (1 << 0)) != 0;
+  ssse3_available = (cpu_info[2] & (1 << 9)) != 0;
+  sse41_available = (cpu_info[2] & (1 << 19)) != 0;
+  sse42_available = (cpu_info[2] & (1 << 20)) != 0;
+
+  xsave_available = (cpu_info[2] & (1 << 26)) != 0;
+  xsave_enabled_by_os = (cpu_info[2] & (1 << 27)) != 0;
+
+  /* Check for AVX-based features, if the processor supports extended features. */
+  if (max_basic_function_id >= 7) {
+    __cpuid(cpu_info, 7);
+    avx2_available = (cpu_info[1] & (1 << 5)) != 0;
+    avx512bw_available = (cpu_info[1] & (1 << 30)) != 0;
   }
 
-  /* Optimized shuffle */
-  /* The buffer must be aligned on a 16 bytes boundary, have a power */
-  /* of 2 size and be larger or equal than 256 bytes. */
-  if (bytesoftype == 4) {
-    shuffle4(_dest, _src, blocksize);
-  }
-  else if (bytesoftype == 8) {
-    shuffle8(_dest, _src, blocksize);
+  /*  Even if certain features are supported by the CPU, they may not be supported
+      by the OS (in which case using them would crash the process or system).
+      If xsave is available and enabled by the OS, check the contents of the
+      extended control register XCR0 to see if the CPU features are enabled. */
+#if defined(_XCR_XFEATURE_ENABLED_MASK)
+  if (xsave_available && xsave_enabled_by_os && (
+      sse2_available || sse3_available || ssse3_available
+      || sse41_available || sse42_available
+      || avx2_available || avx512bw_available)) {
+    /* Determine which register states can be restored by the OS. */
+    xcr0_contents = _xgetbv(_XCR_XFEATURE_ENABLED_MASK);
+
+    xmm_state_enabled = (xcr0_contents & (1UL << 1)) != 0;
+    ymm_state_enabled = (xcr0_contents & (1UL << 2)) != 0;
+
+    /*  Require support for both the upper 256-bits of zmm0-zmm15 to be
+        restored as well as all of zmm16-zmm31 and the opmask registers. */
+    zmm_state_enabled = (xcr0_contents & 0x70) == 0x70;
   }
-  else if (bytesoftype == 16) {
-    shuffle16(_dest, _src, blocksize);
+#endif /* defined(_XCR_XFEATURE_ENABLED_MASK) */
+
+#if defined(BLOSC_DUMP_CPU_INFO)
+  printf("Shuffle CPU Information:\n");
+  printf("SSE2 available: %s\n", sse2_available ? "True" : "False");
+  printf("SSE3 available: %s\n", sse3_available ? "True" : "False");
+  printf("SSSE3 available: %s\n", ssse3_available ? "True" : "False");
+  printf("SSE4.1 available: %s\n", sse41_available ? "True" : "False");
+  printf("SSE4.2 available: %s\n", sse42_available ? "True" : "False");
+  printf("AVX2 available: %s\n", avx2_available ? "True" : "False");
+  printf("AVX512BW available: %s\n", avx512bw_available ? "True" : "False");
+  printf("XSAVE available: %s\n", xsave_available ? "True" : "False");
+  printf("XSAVE enabled: %s\n", xsave_enabled_by_os ? "True" : "False");
+  printf("XMM state enabled: %s\n", xmm_state_enabled ? "True" : "False");
+  printf("YMM state enabled: %s\n", ymm_state_enabled ? "True" : "False");
+  printf("ZMM state enabled: %s\n", zmm_state_enabled ? "True" : "False");
+#endif /* defined(BLOSC_DUMP_CPU_INFO) */
+
+  /* Using the gathered CPU information, determine which implementation to use. */
+  /* technically could fail on sse2 cpu on os without xmm support, but that
+   * shouldn't exist anymore */
+  if (sse2_available) {
+    result |= BLOSC_HAVE_SSE2;
   }
-  else if (bytesoftype == 2) {
-    shuffle2(_dest, _src, blocksize);
-  }
-  else {
-    /* Non-optimized shuffle */
-    _shuffle(bytesoftype, blocksize, _src, _dest);
+  if (xmm_state_enabled && ymm_state_enabled && avx2_available) {
+    result |= BLOSC_HAVE_AVX2;
   }
+  return result;
 }
+#endif
 
+#else   /* No hardware acceleration supported for the target architecture. */
+  #if defined(_MSC_VER)
+  #pragma message("Hardware-acceleration detection not implemented for the target architecture. Only the generic shuffle/unshuffle routines will be available.")
+  #else
+  #warning Hardware-acceleration detection not implemented for the target architecture. Only the generic shuffle/unshuffle routines will be available.
+  #endif
 
-/* Routine optimized for unshuffling a buffer for a type size of 2 bytes. */
-static void
-unshuffle2(uint8_t* dest, uint8_t* orig, size_t size)
-{
-  size_t i, k;
-  size_t neblock, numof16belem;
-  __m128i xmm1[2], xmm2[2];
-
-  neblock = size / 2;
-  numof16belem = neblock / 16;
-  for (i = 0, k = 0; i < numof16belem; i++, k += 2) {
-    /* Load the first 32 bytes in 2 XMM registrers */
-    xmm1[0] = ((__m128i *)orig)[0*numof16belem+i];
-    xmm1[1] = ((__m128i *)orig)[1*numof16belem+i];
-    /* Shuffle bytes */
-    /* Compute the low 32 bytes */
-    xmm2[0] = _mm_unpacklo_epi8(xmm1[0], xmm1[1]);
-    /* Compute the hi 32 bytes */
-    xmm2[1] = _mm_unpackhi_epi8(xmm1[0], xmm1[1]);
-    /* Store the result vectors in proper order */
-    ((__m128i *)dest)[k+0] = xmm2[0];
-    ((__m128i *)dest)[k+1] = xmm2[1];
-  }
+static blosc_cpu_features blosc_get_cpu_features(void) {
+  return BLOSC_HAVE_NOTHING;
 }
 
+#endif
 
-/* Routine optimized for unshuffling a buffer for a type size of 4 bytes. */
-static void
-unshuffle4(uint8_t* dest, uint8_t* orig, size_t size)
-{
-  size_t i, j, k;
-  size_t neblock, numof16belem;
-  __m128i xmm0[4], xmm1[4];
-
-  neblock = size / 4;
-  numof16belem = neblock / 16;
-  for (i = 0, k = 0; i < numof16belem; i++, k += 4) {
-    /* Load the first 64 bytes in 4 XMM registrers */
-    for (j = 0; j < 4; j++) {
-      xmm0[j] = ((__m128i *)orig)[j*numof16belem+i];
-    }
-    /* Shuffle bytes */
-    for (j = 0; j < 2; j++) {
-      /* Compute the low 32 bytes */
-      xmm1[j] = _mm_unpacklo_epi8(xmm0[j*2], xmm0[j*2+1]);
-      /* Compute the hi 32 bytes */
-      xmm1[2+j] = _mm_unpackhi_epi8(xmm0[j*2], xmm0[j*2+1]);
-    }
-    /* Shuffle 2-byte words */
-    for (j = 0; j < 2; j++) {
-      /* Compute the low 32 bytes */
-      xmm0[j] = _mm_unpacklo_epi16(xmm1[j*2], xmm1[j*2+1]);
-      /* Compute the hi 32 bytes */
-      xmm0[2+j] = _mm_unpackhi_epi16(xmm1[j*2], xmm1[j*2+1]);
-    }
-    /* Store the result vectors in proper order */
-    ((__m128i *)dest)[k+0] = xmm0[0];
-    ((__m128i *)dest)[k+1] = xmm0[2];
-    ((__m128i *)dest)[k+2] = xmm0[1];
-    ((__m128i *)dest)[k+3] = xmm0[3];
+static shuffle_implementation_t get_shuffle_implementation() {
+  blosc_cpu_features cpu_features = blosc_get_cpu_features();
+  shuffle_implementation_t impl_generic;
+
+#if defined(SHUFFLE_AVX2_ENABLED)
+  if (cpu_features & BLOSC_HAVE_AVX2) {
+    shuffle_implementation_t impl_avx2;
+    impl_avx2.name = "avx2";
+    impl_avx2.shuffle = (shuffle_func)shuffle_avx2;
+    impl_avx2.unshuffle = (unshuffle_func)unshuffle_avx2;
+    impl_avx2.bitshuffle = (bitshuffle_func)bshuf_trans_bit_elem_avx2;
+    impl_avx2.bitunshuffle = (bitunshuffle_func)bshuf_untrans_bit_elem_avx2;
+    return impl_avx2;
   }
-}
-
-
-/* Routine optimized for unshuffling a buffer for a type size of 8 bytes. */
-static void
-unshuffle8(uint8_t* dest, uint8_t* orig, size_t size)
-{
-  size_t i, j, k;
-  size_t neblock, numof16belem;
-  __m128i xmm0[8], xmm1[8];
-
-  neblock = size / 8;
-  numof16belem = neblock / 16;
-  for (i = 0, k = 0; i < numof16belem; i++, k += 8) {
-    /* Load the first 64 bytes in 8 XMM registrers */
-    for (j = 0; j < 8; j++) {
-      xmm0[j] = ((__m128i *)orig)[j*numof16belem+i];
-    }
-    /* Shuffle bytes */
-    for (j = 0; j < 4; j++) {
-      /* Compute the low 32 bytes */
-      xmm1[j] = _mm_unpacklo_epi8(xmm0[j*2], xmm0[j*2+1]);
-      /* Compute the hi 32 bytes */
-      xmm1[4+j] = _mm_unpackhi_epi8(xmm0[j*2], xmm0[j*2+1]);
-    }
-    /* Shuffle 2-byte words */
-    for (j = 0; j < 4; j++) {
-      /* Compute the low 32 bytes */
-      xmm0[j] = _mm_unpacklo_epi16(xmm1[j*2], xmm1[j*2+1]);
-      /* Compute the hi 32 bytes */
-      xmm0[4+j] = _mm_unpackhi_epi16(xmm1[j*2], xmm1[j*2+1]);
-    }
-    /* Shuffle 4-byte dwords */
-    for (j = 0; j < 4; j++) {
-      /* Compute the low 32 bytes */
-      xmm1[j] = _mm_unpacklo_epi32(xmm0[j*2], xmm0[j*2+1]);
-      /* Compute the hi 32 bytes */
-      xmm1[4+j] = _mm_unpackhi_epi32(xmm0[j*2], xmm0[j*2+1]);
-    }
-    /* Store the result vectors in proper order */
-    ((__m128i *)dest)[k+0] = xmm1[0];
-    ((__m128i *)dest)[k+1] = xmm1[4];
-    ((__m128i *)dest)[k+2] = xmm1[2];
-    ((__m128i *)dest)[k+3] = xmm1[6];
-    ((__m128i *)dest)[k+4] = xmm1[1];
-    ((__m128i *)dest)[k+5] = xmm1[5];
-    ((__m128i *)dest)[k+6] = xmm1[3];
-    ((__m128i *)dest)[k+7] = xmm1[7];
+#endif  /* defined(SHUFFLE_AVX2_ENABLED) */
+
+#if defined(SHUFFLE_SSE2_ENABLED)
+  if (cpu_features & BLOSC_HAVE_SSE2) {
+    shuffle_implementation_t impl_sse2;
+    impl_sse2.name = "sse2";
+    impl_sse2.shuffle = (shuffle_func)shuffle_sse2;
+    impl_sse2.unshuffle = (unshuffle_func)unshuffle_sse2;
+    impl_sse2.bitshuffle = (bitshuffle_func)bshuf_trans_bit_elem_sse2;
+    impl_sse2.bitunshuffle = (bitunshuffle_func)bshuf_untrans_bit_elem_sse2;
+    return impl_sse2;
   }
+#endif  /* defined(SHUFFLE_SSE2_ENABLED) */
+
+  /*  Processor doesn't support any of the hardware-accelerated implementations,
+      so use the generic implementation. */
+  impl_generic.name = "generic";
+  impl_generic.shuffle = (shuffle_func)shuffle_generic;
+  impl_generic.unshuffle = (unshuffle_func)unshuffle_generic;
+  impl_generic.bitshuffle = (bitshuffle_func)bshuf_trans_bit_elem_scal;
+  impl_generic.bitunshuffle = (bitunshuffle_func)bshuf_untrans_bit_elem_scal;
+  return impl_generic;
 }
 
 
-/* Routine optimized for unshuffling a buffer for a type size of 16 bytes. */
-static void
-unshuffle16(uint8_t* dest, uint8_t* orig, size_t size)
-{
-  size_t i, j, k;
-  size_t neblock, numof16belem;
-  __m128i xmm1[16], xmm2[16];
-
-  neblock = size / 16;
-  numof16belem = neblock / 16;
-  for (i = 0, k = 0; i < numof16belem; i++, k += 16) {
-    /* Load the first 128 bytes in 16 XMM registrers */
-    for (j = 0; j < 16; j++) {
-      xmm1[j] = ((__m128i *)orig)[j*numof16belem+i];
-    }
-    /* Shuffle bytes */
-    for (j = 0; j < 8; j++) {
-      /* Compute the low 32 bytes */
-      xmm2[j] = _mm_unpacklo_epi8(xmm1[j*2], xmm1[j*2+1]);
-      /* Compute the hi 32 bytes */
-      xmm2[8+j] = _mm_unpackhi_epi8(xmm1[j*2], xmm1[j*2+1]);
-    }
-    /* Shuffle 2-byte words */
-    for (j = 0; j < 8; j++) {
-      /* Compute the low 32 bytes */
-      xmm1[j] = _mm_unpacklo_epi16(xmm2[j*2], xmm2[j*2+1]);
-      /* Compute the hi 32 bytes */
-      xmm1[8+j] = _mm_unpackhi_epi16(xmm2[j*2], xmm2[j*2+1]);
-    }
-    /* Shuffle 4-byte dwords */
-    for (j = 0; j < 8; j++) {
-      /* Compute the low 32 bytes */
-      xmm2[j] = _mm_unpacklo_epi32(xmm1[j*2], xmm1[j*2+1]);
-      /* Compute the hi 32 bytes */
-      xmm2[8+j] = _mm_unpackhi_epi32(xmm1[j*2], xmm1[j*2+1]);
-    }
-    /* Shuffle 8-byte qwords */
-    for (j = 0; j < 8; j++) {
-      /* Compute the low 32 bytes */
-      xmm1[j] = _mm_unpacklo_epi64(xmm2[j*2], xmm2[j*2+1]);
-      /* Compute the hi 32 bytes */
-      xmm1[8+j] = _mm_unpackhi_epi64(xmm2[j*2], xmm2[j*2+1]);
-    }
-    /* Store the result vectors in proper order */
-    ((__m128i *)dest)[k+0] = xmm1[0];
-    ((__m128i *)dest)[k+1] = xmm1[8];
-    ((__m128i *)dest)[k+2] = xmm1[4];
-    ((__m128i *)dest)[k+3] = xmm1[12];
-    ((__m128i *)dest)[k+4] = xmm1[2];
-    ((__m128i *)dest)[k+5] = xmm1[10];
-    ((__m128i *)dest)[k+6] = xmm1[6];
-    ((__m128i *)dest)[k+7] = xmm1[14];
-    ((__m128i *)dest)[k+8] = xmm1[1];
-    ((__m128i *)dest)[k+9] = xmm1[9];
-    ((__m128i *)dest)[k+10] = xmm1[5];
-    ((__m128i *)dest)[k+11] = xmm1[13];
-    ((__m128i *)dest)[k+12] = xmm1[3];
-    ((__m128i *)dest)[k+13] = xmm1[11];
-    ((__m128i *)dest)[k+14] = xmm1[7];
-    ((__m128i *)dest)[k+15] = xmm1[15];
-  }
-}
-
+/*  Flag indicating whether the implementation has been initialized.
+    Zero means it hasn't been initialized, non-zero means it has. */
+static int32_t implementation_initialized;
 
-/* Unshuffle a block.  This can never fail. */
-void unshuffle(size_t bytesoftype, size_t blocksize,
-               uint8_t* _src, uint8_t* _dest) {
-  int unaligned_src = (int)((uintptr_t)_src % 16);
-  int unaligned_dest = (int)((uintptr_t)_dest % 16);
-  int power_of_two = (blocksize & (blocksize - 1)) == 0;
-  int too_small = (blocksize < 256);
+/*  The dynamically-chosen shuffle/unshuffle implementation.
+    This is only safe to use once `implementation_initialized` is set. */
+static shuffle_implementation_t host_implementation;
 
-  if (unaligned_src || unaligned_dest || !power_of_two || too_small) {
-    /* _src or _dest buffer is not aligned, not a power of two or is
-       too small.  Call the non-sse2 version. */
-    _unshuffle(bytesoftype, blocksize, _src, _dest);
-    return;
-  }
+/*  Lazily select and cache the host shuffle implementation on first use. */
+#if defined(__GNUC__) || defined(__clang__)
+__attribute__((always_inline))
+#endif
+static
+#if defined(_MSC_VER)
+__forceinline
+#else
+inline
+#endif
+void init_shuffle_implementation() {
+  /* Initialization may (in rare cases) run concurrently on several
+     threads. No synchronization is used: the original rationale is
+     that every thread computes the same implementation, so racing
+     writers store identical values, and skipping a volatile load of
+     the flag on each call gives a small performance benefit.
+     NOTE(review): the flag and host_implementation are assigned here
+     with plain stores (no atomics or barriers); confirm this
+     benign-race assumption holds on every supported target. */
+#if defined(__GNUC__) || defined(__clang__)
+  if (__builtin_expect(!implementation_initialized, 0)) {
+#else
+  if (!implementation_initialized) {
+#endif
+    /* First call: query get_shuffle_implementation() and cache the result. */
+    host_implementation = get_shuffle_implementation();
 
-  /* Optimized unshuffle */
-  /* The buffers must be aligned on a 16 bytes boundary, have a power */
-  /* of 2 size and be larger or equal than 256 bytes. */
-  if (bytesoftype == 4) {
-    unshuffle4(_dest, _src, blocksize);
-  }
-  else if (bytesoftype == 8) {
-    unshuffle8(_dest, _src, blocksize);
-  }
-  else if (bytesoftype == 16) {
-    unshuffle16(_dest, _src, blocksize);
-  }
-  else if (bytesoftype == 2) {
-    unshuffle2(_dest, _src, blocksize);
-  }
-  else {
-    /* Non-optimized unshuffle */
-    _unshuffle(bytesoftype, blocksize, _src, _dest);
+    /*  Mark the cached implementation as ready so later calls skip this. */
+    implementation_initialized = 1;
   }
 }
 
-#else   /* no __SSE2__ available */
+/*  Shuffle a block by forwarding the arguments unchanged to the shuffle
+    routine of the implementation (accelerated or generic) chosen at run-time. */
+void
+shuffle(const size_t bytesoftype, const size_t blocksize,
+        const uint8_t* _src, const uint8_t* _dest) {
+  /* Select and cache the host implementation on the first call. */
+  init_shuffle_implementation();
+
+  /*  The implementation is initialized.
+      Dispatch to its shuffle routine. */
+  (host_implementation.shuffle)(bytesoftype, blocksize, _src, _dest);
+}
 
-void shuffle(size_t bytesoftype, size_t blocksize,
-             uint8_t* _src, uint8_t* _dest) {
-  _shuffle(bytesoftype, blocksize, _src, _dest);
+/*  Unshuffle a block by forwarding the arguments unchanged to the unshuffle
+    routine of the implementation (accelerated or generic) chosen at run-time. */
+void
+unshuffle(const size_t bytesoftype, const size_t blocksize,
+          const uint8_t* _src, const uint8_t* _dest) {
+  /* Select and cache the host implementation on the first call. */
+  init_shuffle_implementation();
+
+  /*  The implementation is initialized.
+      Dispatch to its unshuffle routine. */
+  (host_implementation.unshuffle)(bytesoftype, blocksize, _src, _dest);
 }
 
-void unshuffle(size_t bytesoftype, size_t blocksize,
-               uint8_t* _src, uint8_t* _dest) {
-  _unshuffle(bytesoftype, blocksize, _src, _dest);
+/*  Bit-shuffle a block by dispatching to the implementation chosen at run-time.
+    Returns that routine's result cast to int. When the element count is not a
+    multiple of 8, copies blocksize bytes from _src to _dest and returns the count. */
+int
+bitshuffle(const size_t bytesoftype, const size_t blocksize,
+           const uint8_t* const _src, const uint8_t* _dest,
+           const uint8_t* _tmp) {
+  int size = blocksize / bytesoftype;
+  /* Select and cache the host implementation on the first call. */
+  init_shuffle_implementation();
+
+  if ((size % 8) == 0)
+    /* The element count is a multiple of 8, which bitshuffle supports. */
+    return (int)(host_implementation.bitshuffle)((void*)_src, (void*)_dest,
+                                                 blocksize / bytesoftype,
+                                                 bytesoftype, (void*)_tmp);
+  else
+    memcpy((void*)_dest, (void*)_src, blocksize);
+  return size;
 }
 
-#endif  /* __SSE2__ */
+/*  Bit-unshuffle a block by dispatching to the implementation chosen at run-time.
+    Returns that routine's result cast to int. When the element count is not a
+    multiple of 8, copies blocksize bytes from _src to _dest and returns the count. */
+int
+bitunshuffle(const size_t bytesoftype, const size_t blocksize,
+             const uint8_t* const _src, const uint8_t* _dest,
+             const uint8_t* _tmp) {
+  int size = blocksize / bytesoftype;
+  /* Select and cache the host implementation on the first call. */
+  init_shuffle_implementation();
+
+  if ((size % 8) == 0)
+    /* The element count is a multiple of 8, which bitshuffle supports. */
+    return (int)(host_implementation.bitunshuffle)((void*)_src, (void*)_dest,
+                                                   blocksize / bytesoftype,
+                                                   bytesoftype, (void*)_tmp);
+  else
+    memcpy((void*)_dest, (void*)_src, blocksize);
+  return size;
+}
diff --git a/thirdparty/blosc/shuffle.h b/thirdparty/blosc/shuffle.h
index b56c5e16353adcfd22cbb5179e51cf70aa6d0b1c..d0b6ddc6e4ae49ccdc7881b8181a780ca848e1a2 100644
--- a/thirdparty/blosc/shuffle.h
+++ b/thirdparty/blosc/shuffle.h
@@ -1,16 +1,67 @@
 /*********************************************************************
-  Blosc - Blocked Suffling and Compression Library
+  Blosc - Blocked Shuffling and Compression Library
 
-  Author: Francesc Alted <faltet@gmail.com>
+  Author: Francesc Alted <francesc@blosc.org>
 
   See LICENSES/BLOSC.txt for details about copyright and rights to use.
 **********************************************************************/
 
+/*  Shuffle/unshuffle routines which dynamically dispatch to hardware-
+    accelerated routines based on the processor's architecture.
+    Consumers should almost always prefer to call these routines instead
+    of directly calling one of the hardware-accelerated routines, since
+    these are cross-platform and future-proof. */
 
-/* Shuffle/unshuffle routines */
+#ifndef SHUFFLE_H
+#define SHUFFLE_H
 
-void shuffle(size_t bytesoftype, size_t blocksize,
-             unsigned char* _src, unsigned char* _dest);
+#include "shuffle-common.h"
 
-void unshuffle(size_t bytesoftype, size_t blocksize,
-               unsigned char* _src, unsigned char* _dest);
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+  Primary shuffle and bitshuffle routines.
+  These functions dynamically dispatch to the appropriate hardware-accelerated
+  routine based on the host processor's architecture. If the host processor is
+  not supported by any of the hardware-accelerated routines, the generic
+  (non-accelerated) implementation is used instead.
+  Consumers should almost always prefer to call these routines instead of
+  directly calling the hardware-accelerated routines because this approach is
+  both cross-platform and future-proof.
+*/
+BLOSC_NO_EXPORT void
+shuffle(const size_t bytesoftype, const size_t blocksize,
+        const uint8_t* _src, const uint8_t* _dest);
+
+BLOSC_NO_EXPORT int
+bitshuffle(const size_t bytesoftype, const size_t blocksize,
+           const uint8_t* const _src, const uint8_t* _dest,
+           const uint8_t* _tmp);
+
+/**
+  Primary unshuffle and bitunshuffle routines.
+  These functions dynamically dispatch to the appropriate hardware-accelerated
+  routine based on the host processor's architecture. If the host processor is
+  not supported by any of the hardware-accelerated routines, the generic
+  (non-accelerated) implementation is used instead.
+  Consumers should almost always prefer to call these routines instead of
+  directly calling the hardware-accelerated routines because this approach is
+  both cross-platform and future-proof.
+*/
+BLOSC_NO_EXPORT void
+unshuffle(const size_t bytesoftype, const size_t blocksize,
+          const uint8_t* _src, const uint8_t* _dest);
+
+
+BLOSC_NO_EXPORT int
+bitunshuffle(const size_t bytesoftype, const size_t blocksize,
+             const uint8_t* const _src, const uint8_t* _dest,
+             const uint8_t* _tmp);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* SHUFFLE_H */