diff --git a/README.md b/README.md index 018c371951cfa168519f5563706904fe1c3493c7..659ea68f9935f15feea650caeaff6d7479016f4f 100644 --- a/README.md +++ b/README.md @@ -75,46 +75,35 @@ The `pygio` library is pip-installable and works with `mpi4py`. **Requirements** -Currently, a **CMake version >= 3.11.0** is required to fetch dependencies -during configuration. The ``pygio`` module also requires MPI libraries to be -findable by CMake's FindMPI. The compiler needs to support **C++17** (make sure -that ``CC`` and ``CXX`` point to the correct compiler) +- Currently, a **CMake version >= 3.11.0** is required to fetch dependencies +during configuration. If the system does not provide a suitable `cmake` version, +`pip` should (theoretically) download `cmake` from the PyPI repository. +- The ``pygio`` module also requires MPI libraries to be detectable by CMake's +FindMPI. The compiler needs to support **C++17** (make sure that ``CC`` and +``CXX`` point to the correct compiler). **Install** The python library can be installed by running pip in the **main folder**: ```bash -pip install . +python -m pip install . +``` +Alternatively, the library can also directly be installed from the git URL +without having to clone the repository first: +```bash +python -m pip install git+https://git.cels.anl.gov/hacc/genericio.git ``` It will use the compiler referred by the ``CC`` and ``CXX`` environment -variable. If the compiler supports OpenMP, the library will be threaded. Make -sure to set ``OMP_NUM_THREADS`` to an appropriate variable, in particluar when -using multiple MPI ranks per node. - ------ - -## Output file partitions (subfiles) - -If you're running on an IBM BG/Q supercomputer, then the number of subfiles -(partitions) chosen is based on the I/O nodes in an automatic way. Otherwise, by -default, the GenericIO library picks the number of subfiles based on a -fairly-naive hostname-based hashing scheme. 
This works reasonably-well on small -clusters, but not on larger systems. On a larger system, you might want to set -these environmental variables: +variables. In case the automatically detected compiler is incorrect, specify the +compiler path as ```bash -GENERICIO_PARTITIONS_USE_NAME=0 -GENERICIO_RANK_PARTITIONS=256 +CC=/path/to/gcc CXX=/path/to/g++ python -m pip install . ``` -Where the number of partitions (256 above) determines the number of subfiles -used. If you're using a Lustre file system, for example, an optimal number of -files is: -``` -# of files * stripe count ~ # OSTs -``` +If the compiler supports OpenMP, the library will be threaded. Make +sure to set ``OMP_NUM_THREADS`` to an appropriate value, in particular when +using multiple MPI ranks per node. -On Titan, for example, there are 1008 OSTs, and a default stripe count of 4, so -we use approximately 256 files. diff --git a/docs/environment.rst b/docs/environment.rst index 7dc96caf4bbf3047d542271511c97d91eecff868..4e03e35f9bea8ab26fa08838e3c642bb62717c5a 100644 --- a/docs/environment.rst +++ b/docs/environment.rst @@ -1,8 +1,35 @@ Runtime Options =============== -Library / Executables ---------------------- +Output file partitions (subfiles) +--------------------------------- + +If you're running on an IBM BG/Q supercomputer, then the number of subfiles +(partitions) chosen is based on the I/O nodes in an automatic way. Otherwise, by +default, the GenericIO library picks the number of subfiles based on a +fairly-naive hostname-based hashing scheme. This works reasonably-well on small +clusters, but not on larger systems. On a larger system, you might want to set +these environmental variables: + +.. code-block:: bash + + GENERICIO_PARTITIONS_USE_NAME=0 + GENERICIO_RANK_PARTITIONS=256 + +Where the number of partitions (256 above) determines the number of subfiles +used. If you're using a Lustre file system, for example, an optimal number of +files is: + +.. 
code-block:: + + # of files * stripe count ~ # OSTs + +On Titan, for example, there are 1008 OSTs, and a default stripe count of 4, so +we use approximately 256 files. + + +All GenericIO Flags +------------------- ``GENERICIO_RANK0_CREATE_ALL`` @@ -20,13 +47,10 @@ Library / Executables ``GENERICIO_RANK_PARTITIONS`` -Executables ------------ - ``GENERICIO_USE_MPIIO`` -BLOSC ------ +BLOSC Flags +----------- ``BLOSC_CLEVEL`` @@ -40,4 +64,4 @@ BLOSC ``BLOSC_NTHREADS`` -``BLOSC_NOLOCK`` \ No newline at end of file +``BLOSC_NOLOCK`` diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 99f05bcbc6a0c3ee2a179c826b2df7f4cbfa447d..ab2153b297dccd5906d270f608b28a7ce145dbe5 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -5,7 +5,7 @@ include(FetchContent) FetchContent_Declare( pybind11 GIT_REPOSITORY https://github.com/pybind/pybind11.git - GIT_TAG v2.9.1 + GIT_TAG v2.9.2 ) FetchContent_GetProperties(pybind11) diff --git a/python/genericio.cpp b/python/genericio.cpp index 53df6f5502cec0c448fe6eb2b86e38315c10fc57..a29d478f35d38ecbcf05d88bb8237410cc05dcaf 100644 --- a/python/genericio.cpp +++ b/python/genericio.cpp @@ -8,6 +8,7 @@ #include <map> #include <cstdint> #include <optional> +#include <stdexcept> #ifndef GENERICIO_NO_MPI #include <mpi.h> @@ -119,6 +120,10 @@ public: std::map<std::string, py::array> result; for(const std::string& var_name: *var_names) { + // check if it's not already defined (caused some segfaults at some point) + if(result.count(var_name)) { + throw std::invalid_argument(std::string("variable name was passed multiple times: ") + var_name); + } auto varp = std::find_if( variables.begin(), variables.end(), @@ -127,24 +132,28 @@ public: if (varp != variables.end()) { // extra space py::ssize_t readsize = num_elem + requestedExtraSpace()/(*varp).ElementSize; + // py::array_t constructor: (shape, stride) -> some compilers or numpy + // versions require stride to be explicitly specified if((*varp).IsFloat && (*varp).ElementSize == 
4) { - result[var_name] = py::array_t<float>(readsize); + result[var_name] = py::array_t<float>({readsize}, {4}); addVariable(*varp, result[var_name].mutable_data(), gio::GenericIO::VarHasExtraSpace); } else if((*varp).IsFloat && (*varp).ElementSize == 8) { - result[var_name] = py::array_t<double>(readsize); + result[var_name] = py::array_t<double>({readsize}, {8}); addVariable(*varp, result[var_name].mutable_data(), gio::GenericIO::VarHasExtraSpace); } else if(!(*varp).IsFloat && (*varp).ElementSize == 4) { - result[var_name] = py::array_t<int32_t>(readsize); + result[var_name] = py::array_t<int32_t>({readsize}, {4}); addVariable(*varp, result[var_name].mutable_data(), gio::GenericIO::VarHasExtraSpace); } else if(!(*varp).IsFloat && (*varp).ElementSize == 8) { - result[var_name] = py::array_t<int64_t>(readsize); + result[var_name] = py::array_t<int64_t>({readsize}, {8}); addVariable(*varp, result[var_name].mutable_data(), gio::GenericIO::VarHasExtraSpace); } else if(!(*varp).IsFloat && (*varp).ElementSize == 2) { - result[var_name] = py::array_t<uint16_t>(readsize); + result[var_name] = py::array_t<uint16_t>({readsize}, {2}); addVariable(*varp, result[var_name].mutable_data(), gio::GenericIO::VarHasExtraSpace); } else { - throw std::runtime_error(std::string("Unknown data type in GenericIO for variable ") + var_name); + throw std::runtime_error(std::string("unknown data type in GenericIO for variable: ") + var_name); } + } else { // variable not found + throw std::invalid_argument(std::string("requested variable is not defined in GenericIO file: ") + var_name); } }