# Source code for genomicarrays.build_options

from dataclasses import dataclass
from typing import Dict, Literal, Optional, Callable

import numpy as np

__author__ = "Jayaram Kancherla"
__copyright__ = "Jayaram Kancherla"
__license__ = "MIT"


@dataclass
class MatrixOptions:
    """Optional arguments for the ``matrix`` store for
    :py:func:`~genomicarrays.build_genomicarray.build_genomicarray`.

    Attributes:
        skip:
            Whether to skip generating matrix TileDB.
            Defaults to False.

        matrix_attr_name:
            Name of the matrix to be stored in the TileDB file.
            Defaults to "data".

        dtype:
            NumPy dtype for the values in the matrix.
            Defaults to np.float32.

            Note: make sure the matrix values fit
            within the range limits of the chosen dtype.

        tiledb_store_name:
            Name of the TileDB file.
            Defaults to "coverage".

        chunk_size:
            Size of chunks for parallel processing.
            Defaults to 1000.

        compression:
            TileDB compression filter
            (None, 'gzip', 'zstd', 'lz4').
            Defaults to "zstd".

        compression_level:
            Compression level (1-9).
            Defaults to 4.

    Raises:
        ValueError:
            If ``compression`` is not one of the supported filters,
            or ``compression_level`` is outside the range 1-9.
    """

    skip: bool = False
    matrix_attr_name: str = "data"
    dtype: np.dtype = np.float32
    tiledb_store_name: str = "coverage"
    chunk_size: int = 1000
    # ``None`` is an accepted value (see ``__post_init__``), so the
    # annotation must allow it.
    compression: Optional[Literal["zstd", "gzip", "lz4"]] = "zstd"
    compression_level: int = 4

    def __post_init__(self):
        """Validate configuration."""
        # Tuple (not set) membership: an unhashable value such as a list
        # is then reported as an unsupported compression via ValueError
        # rather than surfacing as a TypeError from hashing.
        if self.compression not in ("zstd", "gzip", "lz4", None):
            raise ValueError(f"Unsupported compression: {self.compression}")

        if not 1 <= self.compression_level <= 9:
            raise ValueError(f"Invalid compression level: {self.compression_level}")
@dataclass
class SampleMetadataOptions:
    """Optional arguments for the ``sample`` store for
    :py:func:`~genomicarrays.build_genomicarray.build_genomicarray`.

    Attributes:
        skip:
            Whether to skip generating sample TileDB.
            Defaults to False.

        dtype:
            NumPy dtype for the sample dimension.
            Defaults to np.uint32.

            Note: make sure the number of samples fits
            within the integer limits of the chosen dtype.

        tiledb_store_name:
            Name of the TileDB file.
            Defaults to "sample_metadata".

        column_types:
            A dictionary with column names as keys and, as values,
            the type to use for that column in TileDB.

            If `None`, all columns are cast as 'ascii'.
    """

    skip: bool = False
    dtype: np.dtype = np.uint32
    tiledb_store_name: str = "sample_metadata"
    # Default is ``None``, so the annotation is explicitly Optional.
    column_types: Optional[Dict[str, np.dtype]] = None
@dataclass
class FeatureAnnotationOptions:
    """Optional arguments for the ``feature`` store for
    :py:func:`~genomicarrays.build_genomicarray.build_genomicarray`.

    Attributes:
        skip:
            Whether to skip generating feature TileDB.
            Defaults to False.

        dtype:
            NumPy dtype for the feature dimension.
            Defaults to np.uint32.

            Note: make sure the number of features fits
            within the integer limits of the chosen dtype.

        tiledb_store_name:
            Name of the TileDB file.
            Defaults to "feature_annotation".

        column_types:
            A dictionary with column names as keys and, as values,
            the type to use for that column in TileDB.

            If `None`, it is replaced after initialization with
            ``{"seqnames": "ascii", "starts": "int", "ends": "int"}``.

        aggregate_function:
            A callable to summarize the values in a given
            interval. The aggregate function is expected to
            return either a scalar value or a 1-dimensional
            NumPy `ndarray`.

            Defaults to None.

        expected_agg_function_length:
            Length of the output when an aggregate function is
            applied to an interval. Defaults to 1, expecting a scalar.

            Note: `ndarrays` will be flattened before writing to
            TileDB.
    """

    skip: bool = False
    dtype: np.dtype = np.uint32
    tiledb_store_name: str = "feature_annotation"
    # Default is ``None`` (filled in by ``__post_init__``), so the
    # annotation is explicitly Optional.
    column_types: Optional[Dict[str, np.dtype]] = None
    aggregate_function: Optional[Callable] = None
    expected_agg_function_length: int = 1

    def __post_init__(self):
        """Fill in the default column types when none were provided."""
        # A fresh dict is built per instance, so instances never share
        # (and cannot accidentally mutate) a common default mapping.
        if self.column_types is None:
            self.column_types = {"seqnames": "ascii", "starts": "int", "ends": "int"}