Source code for genomicarrays.GenomicArrayDataset

"""Query the `GenomicArrayDataset`.

This class provides methods to access the directory containing the
TileDB files, usually generated by
:py:func:`~genomicarrays.build_genomicarray.build_genomicarray`.

Example:

    .. code-block:: python

        from genomicarrays import (
            GenomicArrayDataset,
        )

        garr = GenomicArrayDataset(
            dataset_path="/path/to/genomicarray/dir"
        )
        result1 = garr[
            0:10, 0
        ]

        print(result1)
"""

import os
from typing import List, Sequence, Union

import pandas as pd
import tiledb

from . import queryutils_tiledb_frame as qtd
from .GenomicArrayDatasetSlice import GenomicArrayDatasetSlice

__author__ = "Jayaram Kancherla"
__copyright__ = "Jayaram Kancherla"
__license__ = "MIT"


class GenomicArrayDataset:
    """A collection of features and their associated coverage in a TileDB-backed store.

    Three TileDB objects are opened read-only from ``dataset_path``: the
    coverage matrix, the feature annotation frame and the sample metadata
    frame. They stay open until :py:meth:`close` is called, the object is
    used as a context manager and the ``with`` block exits, or the object is
    garbage collected.
    """

    # Attribute names holding the open TileDB handles, in the order they are
    # closed. Kept in one place so cleanup code cannot drift from ``__init__``.
    _TDB_HANDLE_ATTRS = (
        "_matrix_tdb_tdb",
        "_feature_annotation_tdb",
        "_sample_metadata_tdb",
    )

    def __init__(
        self,
        dataset_path: str,
        matrix_tdb_uri: str = "coverage",
        feature_annotation_uri: str = "feature_annotation",
        sample_metadata_uri: str = "sample_metadata",
    ):
        """Initialize a ``GenomicArrayDataset``.

        Args:
            dataset_path:
                Path to the directory containing the TileDB stores.
                Usually the ``output_path`` given to ``build_genomicarray``.

            matrix_tdb_uri:
                Relative path to matrix store.

            feature_annotation_uri:
                Relative path to feature annotation store.

            sample_metadata_uri:
                Relative path to sample metadata store.

        Raises:
            ValueError:
                If ``dataset_path`` is not a directory.
        """
        if not os.path.isdir(dataset_path):
            raise ValueError("'dataset_path' is not a directory.")

        self._dataset_path = dataset_path

        # TODO: Maybe switch to on-demand loading of these objects
        opened = []
        try:
            for uri in (matrix_tdb_uri, feature_annotation_uri, sample_metadata_uri):
                opened.append(tiledb.open(f"{dataset_path}/{uri}", "r"))
        except Exception:
            # A later store failed to open: release the ones already opened
            # instead of leaking them, then let the original error propagate.
            for handle in opened:
                handle.close()
            raise

        (
            self._matrix_tdb_tdb,
            self._feature_annotation_tdb,
            self._sample_metadata_tdb,
        ) = opened

    def close(self) -> None:
        """Close every TileDB handle held by this object.

        Safe to call more than once and on partially constructed instances
        (for example when ``__init__`` raised before the stores were opened).
        """
        for name in self._TDB_HANDLE_ATTRS:
            handle = getattr(self, name, None)
            if handle is not None:
                handle.close()
                setattr(self, name, None)

    def __enter__(self):
        """Return ``self`` so the dataset can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback) -> None:
        """Close the stores when the ``with`` block exits."""
        self.close()

    def __del__(self):
        """Release the TileDB handles when the object is garbage collected."""
        self.close()

    ####
    ## Subset methods for the `feature_annotation` TileDB file.
    ####

    @staticmethod
    def _resolve_columns(columns, available: List[str]) -> List[str]:
        """Normalize and validate a column selection against ``available``.

        Args:
            columns:
                ``None`` (select everything), a single column name, or a
                list of column names.

            available:
                Column names present in the store.

        Returns:
            ``available`` when ``columns`` is ``None``, otherwise the
            requested columns as a list.

        Raises:
            ValueError:
                If any requested column is not in ``available``.
        """
        if isinstance(columns, str):
            columns = [columns]

        if columns is None:
            return available

        missing = [col for col in columns if col not in available]
        if missing:
            raise ValueError(f"Columns '{', '.join(missing)}' are not available.")

        return columns

    def _get_indices_for_gene_list(self, query: List[str]) -> List[int]:
        """Translate feature ids into positions within the feature index.

        NOTE(review): assumes the position of an id in
        :py:meth:`get_feature_annotation_index` equals its row index in the
        ``feature_annotation`` store -- confirm against the builder.

        Args:
            query:
                Feature ids to look up.

        Returns:
            Sorted positions of the requested ids. When an id occurs more
            than once in the index, its first position is used.

        Raises:
            ValueError:
                If any id is not present in the index.
        """
        first_position = {}
        for position, feature_id in enumerate(self.get_feature_annotation_index()):
            first_position.setdefault(feature_id, position)

        missing = [feature_id for feature_id in query if feature_id not in first_position]
        if missing:
            raise ValueError(f"Features '{', '.join(missing)}' are not available in the index.")

        return sorted(first_position[feature_id] for feature_id in query)

    def get_feature_annotation_columns(self) -> List[str]:
        """Get annotation column names from ``feature_annotation`` store.

        Returns:
            List of available annotations.
        """
        return qtd.get_schema_names_frame(self._feature_annotation_tdb)

    def get_feature_annotation_column(self, column_name: str) -> pd.DataFrame:
        """Access a column from the ``feature_annotation`` store.

        Args:
            column_name:
                Name of the column or attribute. Usually one of the names
                returned by :py:meth:`~get_feature_annotation_columns`.

        Returns:
            The ``column_name`` entry of the frame read from the store
            (presumably a single pandas column -- confirm against
            ``get_a_column``).
        """
        res = qtd.get_a_column(self._feature_annotation_tdb, column_name=column_name)
        return res[column_name]

    def get_feature_annotation_index(self) -> List[str]:
        """Get index of the ``feature_annotation`` store.

        Returns:
            The ``genarr_feature_index`` column as a list.
        """
        res = qtd.get_a_column(self._feature_annotation_tdb, "genarr_feature_index")
        return res["genarr_feature_index"].tolist()

    def get_feature_subset(
        self,
        subset: Union[slice, List[str], tiledb.QueryCondition],
        columns=None,
    ) -> pd.DataFrame:
        """Slice the ``feature_annotation`` store.

        Args:
            subset:
                A list of integer indices to subset the ``feature_annotation``
                store.

                Alternatively, may provide a
                :py:class:`tiledb.QueryCondition` to query the store.

                Alternatively, may provide a list of strings to match with
                the index of ``feature_annotation`` store.

            columns:
                A column name or list of column names to access.

                Defaults to None, in which case all columns are extracted.

        Raises:
            ValueError:
                If a requested column, or a requested feature id, is not
                available.

        Returns:
            A pandas Dataframe of the subset.
        """
        # Read the schema once; it does not change while validating columns.
        columns = self._resolve_columns(columns, self.get_feature_annotation_columns())

        if qtd._is_list_strings(subset):
            subset = self._get_indices_for_gene_list(subset)

        return qtd.subset_frame(self._feature_annotation_tdb, subset=subset, columns=columns)

    ####
    ## Subset methods for the `sample_metadata` TileDB file.
    ####

    def get_sample_metadata_columns(self) -> List[str]:
        """Get column names from ``sample_metadata`` store.

        Returns:
            List of available metadata columns.
        """
        return qtd.get_schema_names_frame(self._sample_metadata_tdb)

    def get_sample_metadata_column(self, column_name: str) -> pd.DataFrame:
        """Access a column from the ``sample_metadata`` store.

        Args:
            column_name:
                Name of the column or attribute. Usually one of the names
                returned by :py:meth:`~get_sample_metadata_columns`.

        Returns:
            The ``column_name`` entry of the frame read from the store
            (presumably a single pandas column -- confirm against
            ``get_a_column``).
        """
        res = qtd.get_a_column(self._sample_metadata_tdb, column_name=column_name)
        return res[column_name]

    def get_sample_subset(self, subset: Union[slice, tiledb.QueryCondition], columns=None) -> pd.DataFrame:
        """Slice the ``sample_metadata`` store.

        Args:
            subset:
                A list of integer indices to subset the ``sample_metadata``
                store.

                Alternatively, may also provide a
                :py:class:`tiledb.QueryCondition` to query the store.

            columns:
                A column name or list of column names to access.

                Defaults to None, in which case all columns are extracted.

        Raises:
            ValueError:
                If a requested column is not available.

        Returns:
            A pandas Dataframe of the subset.
        """
        # Read the schema once; it does not change while validating columns.
        columns = self._resolve_columns(columns, self.get_sample_metadata_columns())

        return qtd.subset_frame(self._sample_metadata_tdb, subset=subset, columns=columns)

    ####
    ## Subset methods for the `matrix` TileDB file.
    ####

    @staticmethod
    def _selection_size(selection) -> int:
        """Return how many entries ``selection`` picks along one dimension.

        A scalar ``int`` selects one entry; anything sized selects
        ``len(selection)`` entries.

        Raises:
            TypeError:
                If ``selection`` is a slice or has no length, because the
                number of selected entries cannot be determined here.
        """
        if isinstance(selection, int):
            return 1

        if isinstance(selection, slice) or not hasattr(selection, "__len__"):
            raise TypeError(
                "matrix selections must be an integer or a sized sequence of indices, "
                f"got '{type(selection).__name__}'."
            )

        return len(selection)

    def get_matrix_subset(self, subset: Union[int, Sequence, tuple]) -> pd.DataFrame:
        """Slice the ``matrix`` store.

        Args:
            subset:
                A scalar index or a sequence of indices selecting rows (all
                columns are returned), or a tuple of one or two such
                selections ``(rows, columns)``.

                ``slice(None)`` is accepted in the column position and
                selects every column. Other slices are not supported because
                the size of the result cannot be derived from them here.

                For what the underlying store accepts, refer to
                `TileDB docs <https://docs.tiledb.com/main/how-to/arrays/reading-arrays/basic-reading>`_.

        Raises:
            ValueError:
                If ``subset`` is an empty tuple or has more than 2 entries.

            TypeError:
                If a selection is of an unsupported type.

        Returns:
            The result of ``subset_array`` for the requested selection.
        """
        if isinstance(subset, tuple):
            if len(subset) == 0:
                raise ValueError("At least one slicing argument must be provided.")

            if len(subset) > 2:
                raise ValueError(f"`{type(self).__name__}` only supports 2-dimensional slicing.")

            rows = subset[0]
            columns = subset[1] if len(subset) == 2 else slice(None)
        else:
            rows, columns = subset, slice(None)

        n_rows = self._selection_size(rows)

        # ``slice(None)`` means "every column"; its extent is the sample count.
        if isinstance(columns, slice) and columns == slice(None):
            n_columns = self.shape[1]
        else:
            n_columns = self._selection_size(columns)

        return qtd.subset_array(
            self._matrix_tdb_tdb,
            rows,
            columns,
            shape=(n_rows, n_columns),
        )

    ####
    ## Subset methods by feature and sample dimensions.
    ####

    def get_slice(
        self,
        feature_subset: Union[slice, int],
        sample_subset: Union[slice, List[str], tiledb.QueryCondition],
    ) -> GenomicArrayDatasetSlice:
        """Subset a ``GenomicArrayDataset``.

        Args:
            feature_subset:
                An ``int`` or ``slice`` specifying the rows (features) to
                retain. Must be contiguous because the matrix rows are read
                as a single range spanning the selected features.

            sample_subset:
                Selection of the columns (samples) to retain; forwarded to
                :py:meth:`get_sample_subset`.

        Raises:
            TypeError:
                If ``feature_subset`` is neither an ``int`` nor a ``slice``.

        Returns:
            A ``GenomicArrayDatasetSlice`` object containing the
            `sample_metadata`, `feature_annotation` and the matrix for the
            given slice ranges.
        """
        # Validate before touching any store so bad input fails fast.
        if not isinstance(feature_subset, (int, slice)):
            raise TypeError("feature indices must be continous; either a 'slice' or 'int' index.")

        sample_frame = self.get_sample_subset(sample_subset)
        sample_indices = sample_frame.index.tolist()

        feature_frame = self.get_feature_subset(feature_subset)

        # Matrix rows covered by the selected features: from the smallest
        # start index up to (excluding) the largest end index.
        start_findex = feature_frame["genarr_feature_start_index"].astype(int).min()
        end_findex = feature_frame["genarr_feature_end_index"].astype(int).max()

        matrix = self.get_matrix_subset((list(range(start_findex, end_findex)), sample_indices))

        return GenomicArrayDatasetSlice(
            sample_frame,
            feature_frame,
            matrix,
        )

    ####
    ## Dunder method to use `[]` operator.
    ####

    def __getitem__(
        self,
        args: Union[int, Sequence, tuple],
    ) -> GenomicArrayDatasetSlice:
        """Subset a ``GenomicArrayDataset``.

        Mostly an alias to :py:meth:`~.get_slice`.

        Args:
            args:
                A scalar or a ``slice`` selecting the rows (features); all
                columns (samples) are retained.

                Alternatively a tuple of length 1. The first entry specifies
                the rows (features) to retain.

                Alternatively a tuple of length 2. The first entry specifies
                the rows (features) to retain, while the second entry
                specifies the columns (samples) to retain.

        Raises:
            ValueError:
                If too many or too few slices provided.

            TypeError:
                If ``args`` is of an unsupported type.

        Returns:
            A ``GenomicArrayDatasetSlice`` object containing the
            `sample_metadata`, `feature_annotation` and the matrix.
        """
        # ``obj[0:10]`` arrives as a bare slice, not a tuple; treat it like a
        # scalar row selection since ``get_slice`` accepts slices for features.
        if isinstance(args, (str, int, slice)):
            return self.get_slice(args, slice(None))

        if isinstance(args, tuple):
            if len(args) == 0:
                raise ValueError("At least one slicing argument must be provided.")

            if len(args) == 1:
                return self.get_slice(args[0], slice(None))

            if len(args) == 2:
                return self.get_slice(args[0], args[1])

            raise ValueError(f"`{type(self).__name__}` only supports 2-dimensional slicing.")

        raise TypeError("args must be a sequence or a scalar integer or string or a tuple of atmost 2 values.")

    ####
    ## Misc methods.
    ####

    @property
    def shape(self):
        """Dimensions of the dataset as ``(rows, columns)``.

        Each value is the upper bound of the first dimension's non-empty
        domain plus one, taken from the ``feature_annotation`` store (rows)
        and the ``sample_metadata`` store (columns).
        """
        return (
            self._feature_annotation_tdb.nonempty_domain()[0][1] + 1,
            self._sample_metadata_tdb.nonempty_domain()[0][1] + 1,
        )

    def __len__(self):
        """Return the number of rows, i.e. ``shape[0]``."""
        return self.shape[0]

    ####
    ## Printing.
    ####

    def __repr__(self) -> str:
        """
        Returns:
            A string representation.
        """
        n_rows, n_columns = self.shape
        # f-string formatting (rather than ``+``) keeps this working when the
        # dataset path is a path-like object instead of a ``str``.
        return (
            f"{type(self).__name__}(number_of_rows={n_rows}"
            f", number_of_columns={n_columns}"
            f", at path={self._dataset_path}"
            ")"
        )

    def __str__(self) -> str:
        """
        Returns:
            A pretty-printed string containing the contents of this object.
        """
        n_rows, n_columns = self.shape
        output = f"class: {type(self).__name__}\n"
        output += f"number_of_rows: {n_rows}\n"
        output += f"number_of_columns: {n_columns}\n"
        output += f"path: '{self._dataset_path}'\n"
        return output