cellarr_array.utils package

Submodules

cellarr_array.utils.config module

class cellarr_array.utils.config.CellArrConfig(tile_capacity=100000, cell_order='row-major', tile_order='row-major', coords_filters=<factory>, offsets_filters=<factory>, attrs_filters=<factory>, ctx_config=<factory>)[source]

Bases: object

Configuration class for TileDB array creation and access.

__annotations__ = {'attrs_filters': typing.Dict[str, typing.List[tiledb.filter.Filter]], 'cell_order': <class 'str'>, 'coords_filters': typing.List[tiledb.filter.Filter], 'ctx_config': typing.Dict[str, typing.Any], 'offsets_filters': typing.List[tiledb.filter.Filter], 'tile_capacity': <class 'int'>, 'tile_order': <class 'str'>}
__dataclass_fields__ = {'attrs_filters': Field(name='attrs_filters',type=typing.Dict[str, typing.List[tiledb.filter.Filter]],default=<dataclasses._MISSING_TYPE object>,default_factory=<function CellArrConfig.<lambda>>,init=True,repr=True,hash=None,compare=True,metadata=mappingproxy({}),kw_only=False,_field_type=_FIELD), 'cell_order': Field(name='cell_order',type=<class 'str'>,default='row-major',default_factory=<dataclasses._MISSING_TYPE object>,init=True,repr=True,hash=None,compare=True,metadata=mappingproxy({}),kw_only=False,_field_type=_FIELD), 'coords_filters': Field(name='coords_filters',type=typing.List[tiledb.filter.Filter],default=<dataclasses._MISSING_TYPE object>,default_factory=<function CellArrConfig.<lambda>>,init=True,repr=True,hash=None,compare=True,metadata=mappingproxy({}),kw_only=False,_field_type=_FIELD), 'ctx_config': Field(name='ctx_config',type=typing.Dict[str, typing.Any],default=<dataclasses._MISSING_TYPE object>,default_factory=<class 'dict'>,init=True,repr=True,hash=None,compare=True,metadata=mappingproxy({}),kw_only=False,_field_type=_FIELD), 'offsets_filters': Field(name='offsets_filters',type=typing.List[tiledb.filter.Filter],default=<dataclasses._MISSING_TYPE object>,default_factory=<function CellArrConfig.<lambda>>,init=True,repr=True,hash=None,compare=True,metadata=mappingproxy({}),kw_only=False,_field_type=_FIELD), 'tile_capacity': Field(name='tile_capacity',type=<class 'int'>,default=100000,default_factory=<dataclasses._MISSING_TYPE object>,init=True,repr=True,hash=None,compare=True,metadata=mappingproxy({}),kw_only=False,_field_type=_FIELD), 'tile_order': Field(name='tile_order',type=<class 'str'>,default='row-major',default_factory=<dataclasses._MISSING_TYPE object>,init=True,repr=True,hash=None,compare=True,metadata=mappingproxy({}),kw_only=False,_field_type=_FIELD)}
__dataclass_params__ = _DataclassParams(init=True,repr=True,eq=True,order=False,unsafe_hash=False,frozen=False)
__eq__(other)

Return self==value.

__hash__ = None
__init__(tile_capacity=100000, cell_order='row-major', tile_order='row-major', coords_filters=<factory>, offsets_filters=<factory>, attrs_filters=<factory>, ctx_config=<factory>)
__match_args__ = ('tile_capacity', 'cell_order', 'tile_order', 'coords_filters', 'offsets_filters', 'attrs_filters', 'ctx_config')
__post_init__()[source]

Convert filter configurations to TileDB Filter objects.

__repr__()

Return repr(self).

attrs_filters: Dict[str, List[Filter]]
cell_order: str = 'row-major'
coords_filters: List[Filter]
static create_filter(filter_config)[source]

Create a TileDB Filter object from a filter configuration.

Return type:

Filter

ctx_config: Dict[str, Any]
offsets_filters: List[Filter]
tile_capacity: int = 100000
tile_order: str = 'row-major'
class cellarr_array.utils.config.ConsolidationConfig(steps=100000, step_min_frags=2, step_max_frags=10, buffer_size=15000000000, total_budget=40000000000, num_threads=4, vacuum_after=True)[source]

Bases: object

Configuration for array consolidation.

__annotations__ = {'buffer_size': <class 'int'>, 'num_threads': <class 'int'>, 'step_max_frags': <class 'int'>, 'step_min_frags': <class 'int'>, 'steps': <class 'int'>, 'total_budget': <class 'int'>, 'vacuum_after': <class 'bool'>}
__dataclass_fields__ = {'buffer_size': Field(name='buffer_size',type=<class 'int'>,default=15000000000,default_factory=<dataclasses._MISSING_TYPE object>,init=True,repr=True,hash=None,compare=True,metadata=mappingproxy({}),kw_only=False,_field_type=_FIELD), 'num_threads': Field(name='num_threads',type=<class 'int'>,default=4,default_factory=<dataclasses._MISSING_TYPE object>,init=True,repr=True,hash=None,compare=True,metadata=mappingproxy({}),kw_only=False,_field_type=_FIELD), 'step_max_frags': Field(name='step_max_frags',type=<class 'int'>,default=10,default_factory=<dataclasses._MISSING_TYPE object>,init=True,repr=True,hash=None,compare=True,metadata=mappingproxy({}),kw_only=False,_field_type=_FIELD), 'step_min_frags': Field(name='step_min_frags',type=<class 'int'>,default=2,default_factory=<dataclasses._MISSING_TYPE object>,init=True,repr=True,hash=None,compare=True,metadata=mappingproxy({}),kw_only=False,_field_type=_FIELD), 'steps': Field(name='steps',type=<class 'int'>,default=100000,default_factory=<dataclasses._MISSING_TYPE object>,init=True,repr=True,hash=None,compare=True,metadata=mappingproxy({}),kw_only=False,_field_type=_FIELD), 'total_budget': Field(name='total_budget',type=<class 'int'>,default=40000000000,default_factory=<dataclasses._MISSING_TYPE object>,init=True,repr=True,hash=None,compare=True,metadata=mappingproxy({}),kw_only=False,_field_type=_FIELD), 'vacuum_after': Field(name='vacuum_after',type=<class 'bool'>,default=True,default_factory=<dataclasses._MISSING_TYPE object>,init=True,repr=True,hash=None,compare=True,metadata=mappingproxy({}),kw_only=False,_field_type=_FIELD)}
__dataclass_params__ = _DataclassParams(init=True,repr=True,eq=True,order=False,unsafe_hash=False,frozen=False)
__eq__(other)

Return self==value.

__hash__ = None
__init__(steps=100000, step_min_frags=2, step_max_frags=10, buffer_size=15000000000, total_budget=40000000000, num_threads=4, vacuum_after=True)
__match_args__ = ('steps', 'step_min_frags', 'step_max_frags', 'buffer_size', 'total_budget', 'num_threads', 'vacuum_after')
__repr__()

Return repr(self).

buffer_size: int = 15000000000
num_threads: int = 4
step_max_frags: int = 10
step_min_frags: int = 2
steps: int = 100000
total_budget: int = 40000000000
vacuum_after: bool = True

cellarr_array.utils.mock module

cellarr_array.utils.mock.generate_tiledb_dense_array(uri, rows, cols, attr_name='data', attr_dtype=<class 'numpy.float32'>, chunk_size=1000, tiledb_config=None)[source]

Generates a dense TileDB array and fills it with random float32 data.

Parameters:
  • uri (str) – URI for the new TileDB array.

  • rows (int) – Number of rows.

  • cols (int) – Number of columns (features).

  • attr_name (str) – Name of the attribute.

  • attr_dtype (dtype) – Data type of the attribute.

  • chunk_size (int) – Number of rows to write per batch.

  • tiledb_config (Optional[Dict]) – TileDB context configuration.

cellarr_array.utils.mock.generate_tiledb_sparse_array(uri, rows, cols, density=0.01, attr_name='data', attr_dtype=<class 'numpy.float32'>, chunk_size=1000, tiledb_config=None, sparse_format_to_write='coo')[source]

Generates a sparse TileDB array and fills it with random float32 data.

Parameters:
  • uri (str) – URI for the new TileDB array.

  • rows (int) – Number of rows.

  • cols (int) – Number of columns (features).

  • density (float) – Density of the sparse matrix.

  • attr_name (str) – Name of the attribute.

  • attr_dtype (dtype) – Data type of the attribute.

  • chunk_size (int) – Number of rows to generate and write per batch.

  • tiledb_config – TileDB context configuration.

  • sparse_format_to_write (str) – SciPy sparse format to use for generating chunks (‘coo’, ‘csr’, or ‘csc’).

Module contents