#ifndef TATAMI_HDF5_DENSE_MATRIX_HPP #define TATAMI_HDF5_DENSE_MATRIX_HPP #include "H5Cpp.h" #include #include #include #include #include "../base/Matrix.hpp" /** * @file HDF5DenseMatrix.hpp * * @brief Defines a class for a HDF5-backed dense matrix. */ namespace tatami { /** * @cond */ namespace HDF5 { template const H5::PredType& define_mem_type() { if constexpr(std::is_same::value) { return H5::PredType::NATIVE_INT; } else if (std::is_same::value) { return H5::PredType::NATIVE_UINT; } else if (std::is_same::value) { return H5::PredType::NATIVE_LONG; } else if (std::is_same::value) { return H5::PredType::NATIVE_ULONG; } else if (std::is_same::value) { return H5::PredType::NATIVE_LLONG; } else if (std::is_same::value) { return H5::PredType::NATIVE_ULLONG; } else if (std::is_same::value) { return H5::PredType::NATIVE_SHORT; } else if (std::is_same::value) { return H5::PredType::NATIVE_USHORT; } else if (std::is_same::value) { return H5::PredType::NATIVE_CHAR; } else if (std::is_same::value) { return H5::PredType::NATIVE_UCHAR; } else if (std::is_same::value) { return H5::PredType::NATIVE_DOUBLE; } else if (std::is_same::value) { return H5::PredType::NATIVE_FLOAT; } else if (std::is_same::value) { return H5::PredType::NATIVE_LDOUBLE; } else if (std::is_same::value) { return H5::PredType::NATIVE_UINT8; } else if (std::is_same::value) { return H5::PredType::NATIVE_INT8; } else if (std::is_same::value) { return H5::PredType::NATIVE_UINT16; } else if (std::is_same::value) { return H5::PredType::NATIVE_INT16; } else if (std::is_same::value) { return H5::PredType::NATIVE_UINT32; } else if (std::is_same::value) { return H5::PredType::NATIVE_INT32; } else if (std::is_same::value) { return H5::PredType::NATIVE_UINT64; } else if (std::is_same::value) { return H5::PredType::NATIVE_INT64; } static_assert("unsupported HDF5 type for template parameter 'T'"); } } /** * @endcond */ /** * @brief Dense matrix backed by a DataSet in a HDF5 file. * * This class retrieves data from the HDF5 file on demand rather than loading it all in at the start. * This allows us to handle very large datasets in limited memory at the cost of some speed. * * We manually handle the chunk caching to speed up access for consecutive rows and columns. * The policy is to minimize the number of calls to the HDF5 library by requesting large contiguous slices where possible. * The size of the slice is determined by the cache limit in the constructor. * * Callers should follow the `prefer_rows()` suggestion when extracting data, * as this tries to minimize the number of chunks that need to be read per access request. * If they do not, the access pattern on disk may be slightly to highly suboptimal, depending on the chunk dimensions. * * As the HDF5 library is not generally thread-safe, the HDF5-related operations should only be run in a single thread. * For OpenMP, this is handled automatically by putting all HDF5 operations in a critical region. * For other parallelization schemes, callers should define the `TATAMI_HDF5_PARALLEL_LOCK` macro; * this should be a function that accepts and executes a no-argument lambda within an appropriate serial region (e.g., based on a global mutex). * * @tparam T Type of the matrix values. * @tparam IDX Type of the row/column indices. * @tparam transpose Whether the dataset is transposed in its storage order, i.e., rows in HDF5 are columns in this matrix. */ template class HDF5DenseMatrix : public tatami::Matrix { size_t firstdim, seconddim; std::string file_name, dataset_name; hsize_t chunk_firstdim, chunk_seconddim; hsize_t cache_firstdim, cache_seconddim; bool prefer_firstdim; public: /** * @param file Path to the file. * @param name Path to the dataset inside the file. * @param cache_limit Limit to the size of the chunk cache, in bytes. * * The cache size should be large enough to fit all chunks spanned by a row or column, for (near-)consecutive row and column access respectively. * Otherwise, performance will degrade as the same chunks may need to be repeatedly read back into memory. */ HDF5DenseMatrix(std::string file, std::string name, size_t cache_limit = 100000000) : file_name(std::move(file)), dataset_name(std::move(name)) { #ifndef TATAMI_HDF5_PARALLEL_LOCK #pragma omp critical { #else TATAMI_HDF5_PARALLEL_LOCK([&]() -> void { #endif H5::H5File fhandle(file_name, H5F_ACC_RDONLY); auto dhandle = fhandle.openDataSet(dataset_name); auto space = dhandle.getSpace(); int ndim = space.getSimpleExtentNdims(); if (ndim != 2) { throw std::runtime_error("data in HDF5 dataset is not a two-dimensional array"); } hsize_t dims_out[2]; space.getSimpleExtentDims(dims_out, NULL); firstdim = dims_out[0]; seconddim = dims_out[1]; auto curtype = dhandle.getTypeClass(); if (curtype != H5T_INTEGER && curtype != H5T_FLOAT) { throw std::runtime_error("expected numeric data in the HDF5 dataset"); } auto dparms = dhandle.getCreatePlist(); if (dparms.getLayout() != H5D_CHUNKED) { // If contiguous, each firstdim is treated as a chunk. chunk_firstdim = 1; chunk_seconddim = seconddim; } else { hsize_t chunk_dims[2]; dparms.getChunk(2, chunk_dims); chunk_firstdim = chunk_dims[0]; chunk_seconddim = chunk_dims[1]; } auto limit_cache = [](size_t chunk_dim, size_t mydim, size_t otherdim, size_t type_size, size_t limit) -> size_t { size_t nelements_along_chunkdim = std::min(mydim, limit / (type_size * otherdim)); size_t nchunks = nelements_along_chunkdim / chunk_dim; return std::min(nelements_along_chunkdim, nchunks * chunk_dim); }; size_t data_size = dhandle.getDataType().getSize(); cache_firstdim = limit_cache(chunk_firstdim, firstdim, seconddim, data_size, cache_limit); cache_seconddim = limit_cache(chunk_seconddim, seconddim, firstdim, data_size, cache_limit); // Favoring extraction along the first dimension if it's not crippled // by the chunking. This favors pulling of contiguous sections along // the last (i.e., second) dimension, which is how HDF5 stores and // returns 2D arrays, so we avoid needing some manual transposition. if (cache_firstdim >= chunk_firstdim) { prefer_firstdim = true; } else if (cache_seconddim >= chunk_seconddim) { // Of course, if the first dimension is chunk-compromised, then we // would rather do the transpositions than repeat a read from file. prefer_firstdim = false; } else { // If both are crippled, then we might as well extract along the first dimension. prefer_firstdim = true; } #ifndef TATAMI_HDF5_PARALLEL_LOCK } #else }); #endif return; } public: size_t nrow() const { if constexpr(transpose) { return seconddim; } else { return firstdim; } } size_t ncol() const { if constexpr(transpose) { return firstdim; } else { return seconddim; } } /** * @return Boolean indicating whether to prefer row extraction. * * We favor extraction on the first dimension (rows by default, columns when `transpose = true`) as this matches the HDF5 storage order. * However, for some chunking scheme and `cache_limit`, this might require repeated reads from file; * in such cases, we switch to extraction on the second dimension. */ bool prefer_rows() const { if constexpr(transpose) { return !prefer_firstdim; } else { return prefer_firstdim; } } public: /** * @cond */ struct HDF5DenseWorkspace : public Workspace { H5::H5File file; H5::DataSet dataset; H5::DataSpace dataspace; H5::DataSpace memspace; std::vector cache, buffer; size_t cached_chunk = -1; hsize_t cached_first = 0, cached_last = 0; }; /** * @endcond */ /** * @param row Should a workspace be created for row-wise extraction? * @return A shared pointer to a `Workspace` object is returned. */ std::shared_ptr new_workspace(bool row) const { std::shared_ptr output; #ifndef TATAMI_HDF5_PARALLEL_LOCK #pragma omp critical { #else TATAMI_HDF5_PARALLEL_LOCK([&]() -> void { #endif auto ptr = new HDF5DenseWorkspace; output.reset(ptr); // Turn off HDF5's caching, as we'll be handling that. H5::FileAccPropList fapl(H5::FileAccPropList::DEFAULT.getId()); fapl.setCache(0, 0, 0, 0); ptr->file.openFile(file_name, H5F_ACC_RDONLY, fapl); ptr->dataset = ptr->file.openDataSet(dataset_name); ptr->dataspace = ptr->dataset.getSpace(); #ifndef TATAMI_HDF5_PARALLEL_LOCK } #else }); #endif return output; } private: template const T* extract(size_t i, T* buffer, size_t first, size_t last, H5::DataSet& dataset, H5::DataSpace& dataspace, H5::DataSpace& memspace) const { hsize_t offset[2]; hsize_t count[2]; constexpr int x = (row != transpose); offset[1-x] = i; offset[x] = first; count[1-x] = 1; count[x] = last - first; // Serial locks are applied in callers. dataspace.selectHyperslab(H5S_SELECT_SET, count, offset); memspace.setExtentSimple(2, count); memspace.selectAll(); dataset.read(buffer, HDF5::define_mem_type(), memspace, dataspace); return buffer; } /* We manually handle the chunk caching, in particular so that we only * pay the cost of rearranging the chunked data into contiguous arrays once. */ template const T* extract(size_t i, T* buffer, size_t first, size_t last, HDF5DenseWorkspace& work) const { // Figuring out which chunk the request belongs to. hsize_t cache_mydim, chunk_otherdim, mydim, otherdim; if constexpr(row != transpose) { cache_mydim = cache_firstdim; chunk_otherdim = chunk_seconddim; mydim = firstdim; otherdim = seconddim; } else { cache_mydim = cache_seconddim; chunk_otherdim = chunk_firstdim; mydim = seconddim; otherdim = firstdim; } // No caching can be done here, so we just extract directly. if (cache_mydim == 0) { const T* out; #ifndef TATAMI_HDF5_PARALLEL_LOCK #pragma omp critical { #else TATAMI_HDF5_PARALLEL_LOCK([&]() -> void { #endif out = extract(i, buffer, first, last, work.dataset, work.dataspace, work.memspace); #ifndef TATAMI_HDF5_PARALLEL_LOCK } #else }); #endif return out; } size_t chunk = i / cache_mydim; size_t index = i % cache_mydim; // If we're in the same chunk for 'i' and the 'first'/'last' are within // range, we re-use the cache. In theory, we could minimize // extractions if the 'first' and 'last' are overlapping with their // cached counterparts, but it's pretty rare that 'first' and 'last' // will change, so we'll just keep things simple. if (chunk != work.cached_chunk || first < work.cached_first || last > work.cached_last) { size_t first_chunk = first / chunk_otherdim; // Need the chunk _after_ the chunk containing the _before-last_ element. size_t last_chunk = last ? (last - 1) / chunk_otherdim + 1 : 0; hsize_t new_cached_first = first_chunk * chunk_otherdim; hsize_t new_cached_last = std::min(otherdim, last_chunk * chunk_otherdim); size_t new_span = new_cached_last - new_cached_first; hsize_t cache_mydim_start = chunk * cache_mydim; hsize_t cache_mydim_end = std::min(mydim, cache_mydim_start + cache_mydim); hsize_t cache_mydim_actual = cache_mydim_end - cache_mydim_start; size_t new_cache_size = new_span * cache_mydim; T* destination; work.cache.resize(new_cache_size); if constexpr(row != transpose) { destination = work.cache.data(); } else { work.buffer.resize(new_cache_size); destination = work.buffer.data(); } // For chunks not present in the cache, we need to loop through and read them in. hsize_t offset[2]; hsize_t count[2]; { constexpr int x = (row != transpose); offset[1-x] = cache_mydim_start; offset[x] = new_cached_first; count[1-x] = cache_mydim_actual; count[x] = new_span; } #ifndef TATAMI_HDF5_PARALLEL_LOCK #pragma omp critical { #else TATAMI_HDF5_PARALLEL_LOCK([&]() -> void { #endif work.dataspace.selectHyperslab(H5S_SELECT_SET, count, offset); // HDF5 is a lot faster when the memspace and dataspace match in dimensionality. // Presumably there is some shuffling that happens inside when dimensions don't match. work.memspace.setExtentSimple(2, count); work.memspace.selectAll(); work.dataset.read(destination, HDF5::define_mem_type(), work.memspace, work.dataspace); #ifndef TATAMI_HDF5_PARALLEL_LOCK } #else }); #endif if constexpr(row == transpose) { auto output = work.cache.begin(); for (hsize_t x = 0; x < count[1]; ++x, output += new_span) { auto in = work.buffer.begin() + x; for (hsize_t y = 0; y < count[0]; ++y, in += count[1]) { *(output + y) = *in; } } } work.cached_first = new_cached_first; work.cached_last = new_cached_last; work.cached_chunk = chunk; } auto start = work.cache.begin() + index * (work.cached_last - work.cached_first); std::copy( start + (first - work.cached_first), start + (last - work.cached_first), buffer ); return buffer; } template const T* extract(size_t i, T* buffer, size_t first, size_t last, Workspace* work) const { if (work) { auto wptr = dynamic_cast(work); return extract(i, buffer, first, last, *wptr); } else { const T* out; #ifndef TATAMI_HDF5_PARALLEL_LOCK #pragma omp critical { #else TATAMI_HDF5_PARALLEL_LOCK([&]() -> void { #endif // Bypass all caching, manual and HDF5. H5::FileAccPropList fapl(H5::FileAccPropList::DEFAULT.getId()); fapl.setCache(10000, 0, 0, 0); H5::H5File file(file_name, H5F_ACC_RDONLY, H5::FileCreatPropList::DEFAULT, fapl); auto dataset = file.openDataSet(dataset_name); auto dataspace = dataset.getSpace(); H5::DataSpace memspace; out = extract(i, buffer, first, last, dataset, dataspace, memspace); #ifndef TATAMI_HDF5_PARALLEL_LOCK } #else }); #endif return out; } } public: const T* row(size_t r, T* buffer, size_t first, size_t last, Workspace* work=nullptr) const { return extract(r, buffer, first, last, work); } const T* column(size_t c, T* buffer, size_t first, size_t last, Workspace* work=nullptr) const { return extract(c, buffer, first, last, work); } using Matrix::row; using Matrix::column; }; } #endif