cellarr-se Demo¶
cellarr-se is a read-only coordinator for TileDB-backed datasets that returns
standard SummarizedExperiment objects when sliced.
Key features:
Lazy loading from TileDB arrays
String-indexed frames for name-based slicing
TileDB query filtering
Returns an in-memory SummarizedExperiment on slice
import os
import shutil
import numpy as np
import pandas as pd
from cellarr_se import CellArraySE
from cellarr_array import create_cellarray
from cellarr_frame import CellArrayFrame
# Setup
# Start every run from an empty scratch directory so that array creation
# below never collides with files left over from a previous run.
DATA_DIR = os.path.abspath("../files/demo")
if os.path.exists(DATA_DIR):
    shutil.rmtree(DATA_DIR)
os.makedirs(DATA_DIR)

# Dimensions shared by every assay and metadata frame in this demo.
N_GENES = 100
N_SAMPLES = 20
1. Create TileDB-backed Components¶
# Create assays (counts and log-normalized counts)
# Seed the global NumPy RNG so the values printed in this demo are reproducible.
np.random.seed(42)

# Raw counts: a dense (genes x samples) float64 matrix.
counts_data = np.random.randint(0, 1000, (N_GENES, N_SAMPLES)).astype(np.float64)
counts = create_cellarray(
    uri=os.path.join(DATA_DIR, "counts.tdb"),
    shape=(N_GENES, N_SAMPLES),
    attr_dtype=np.float64,
    sparse=False,
)
counts.write_batch(counts_data, start_row=0)

# Log-normalized counts: log1p(counts / library_size * 10000).
# size_factors is each sample's total count divided by 1e4, so dividing by it
# rescales every sample (column) to a total of 10,000 before log1p is applied.
size_factors = counts_data.sum(axis=0) / 1e4
logcounts_data = np.log1p(counts_data / size_factors)
logcounts = create_cellarray(
    uri=os.path.join(DATA_DIR, "logcounts.tdb"),
    shape=(N_GENES, N_SAMPLES),
    attr_dtype=np.float64,
    sparse=False,
)
logcounts.write_batch(logcounts_data, start_row=0)

print(f"Counts array: {counts.shape}")
print(f"Logcounts array: {logcounts.shape}")
Counts array: (100, 20)
Logcounts array: (100, 20)
# Create metadata frames with STRING INDICES
# This enables name-based slicing and query filtering
gene_ids = [f"ENSG{i:05d}" for i in range(N_GENES)]
sample_ids = [f"SAMPLE_{i:03d}" for i in range(N_SAMPLES)]

# Row metadata (genes): one record per gene, keyed by its gene id.
row_columns = {
    "gene_name": [f"Gene_{i}" for i in range(N_GENES)],
    "gene_type": np.random.choice(["protein_coding", "lncRNA", "pseudogene"], N_GENES),
}
row_df = pd.DataFrame(row_columns, index=gene_ids).rename_axis("gene_id")

# Persist the frame next to the assays.
row_uri = os.path.join(DATA_DIR, "row_data.tdb")
row_frame = CellArrayFrame.create(row_uri, row_df)

# row_frame[:] reads the whole frame back as a DataFrame for inspection.
print(f"Row frame: {len(row_frame[:].index)} rows, columns: {row_frame.column_names}")
print(row_frame[:].head())
Row frame: 100 rows, columns: ['gene_name', 'gene_type']
gene_name gene_type
gene_id
ENSG00000 Gene_0 lncRNA
ENSG00001 Gene_1 pseudogene
ENSG00002 Gene_2 pseudogene
ENSG00003 Gene_3 lncRNA
ENSG00004 Gene_4 pseudogene
# Column metadata (samples): one record per sample, keyed by its sample id.
col_columns = {
    "tissue": np.random.choice(["liver", "kidney", "brain", "heart"], N_SAMPLES),
    "treatment": np.random.choice(["control", "treated"], N_SAMPLES),
}
col_df = pd.DataFrame(col_columns, index=sample_ids).rename_axis("sample_id")

# Persist the frame next to the assays.
col_uri = os.path.join(DATA_DIR, "col_data.tdb")
col_frame = CellArrayFrame.create(col_uri, col_df)

# col_frame[:] reads the whole frame back as a DataFrame for inspection.
print(f"Col frame: {len(col_frame[:].index)} rows, columns: {col_frame.column_names}")
print(col_frame[:])
Col frame: 20 rows, columns: ['tissue', 'treatment']
tissue treatment
sample_id
SAMPLE_000 kidney control
SAMPLE_001 liver control
SAMPLE_002 brain control
SAMPLE_003 liver control
SAMPLE_004 kidney treated
SAMPLE_005 kidney treated
SAMPLE_006 brain control
SAMPLE_007 kidney control
SAMPLE_008 liver control
SAMPLE_009 kidney treated
SAMPLE_010 liver control
SAMPLE_011 liver treated
SAMPLE_012 kidney treated
SAMPLE_013 brain treated
SAMPLE_014 heart control
SAMPLE_015 kidney treated
SAMPLE_016 liver control
SAMPLE_017 liver treated
SAMPLE_018 kidney control
SAMPLE_019 brain treated
2. Initialize CellArraySE¶
# Bundle the two TileDB-backed assays under the names used to look them up later.
assays = {"counts": counts, "logcounts": logcounts}
se = CellArraySE(assays=assays, row_data=row_frame, col_data=col_frame)

print(se)  # one-line summary: dimensions and assay names
se.show(n=5)  # longer summary with the first 5 rows of each metadata frame
<CellArraySE: 100x20 | counts, logcounts>
CellArraySE Object | 100 rows x 20 cols
Assays: counts, logcounts
--- Row Data ---
gene_name gene_type
gene_id
ENSG00000 Gene_0 lncRNA
ENSG00001 Gene_1 pseudogene
ENSG00002 Gene_2 pseudogene
ENSG00003 Gene_3 lncRNA
ENSG00004 Gene_4 pseudogene
--- Column Data ---
tissue treatment
sample_id
SAMPLE_000 kidney control
SAMPLE_001 liver control
SAMPLE_002 brain control
SAMPLE_003 liver control
SAMPLE_004 kidney treated
3. Properties¶
# Summary properties of the coordinator: overall shape, the assay names,
# and the column names of the row and column metadata frames.
print(f"Shape: {se.shape}")
print(f"Assays: {se.assay_names}")
print(f"Row columns: {se.row_columns}")
print(f"Col columns: {se.col_columns}")
# row_names / col_names hold the string index labels of the metadata frames;
# only the first five of each are shown to keep the output short.
print(f"\nRow names (first 5): {list(se.row_names[:5])}")
print(f"Col names (first 5): {list(se.col_names[:5])}")
Shape: (100, 20)
Assays: ['counts', 'logcounts']
Row columns: ['gene_name', 'gene_type']
Col columns: ['tissue', 'treatment']
Row names (first 5): ['ENSG00000', 'ENSG00001', 'ENSG00002', 'ENSG00003', 'ENSG00004']
Col names (first 5): ['SAMPLE_000', 'SAMPLE_001', 'SAMPLE_002', 'SAMPLE_003', 'SAMPLE_004']
4. Slicing¶
Two ways to slice:
se[rows, cols] - bracket notation
se.slice(...) - method with query support
Both return a SummarizedExperiment object.
# Positional slicing
# Integer slices pick the first 10 rows and the first 5 columns. The result is
# a SummarizedExperiment, not another CellArraySE, which the printed type name
# makes visible.
subset = se[0:10, 0:5]
print(f"se[0:10, 0:5] -> {type(subset).__name__} {subset.shape}")
se[0:10, 0:5] -> SummarizedExperiment (10, 5)
# Slicing by NAME (requires string-indexed frames)
# Lists of index labels are used in place of integer positions.
genes_of_interest = ["ENSG00001", "ENSG00010", "ENSG00050"]
samples_of_interest = ["SAMPLE_000", "SAMPLE_005"]
subset = se[genes_of_interest, samples_of_interest]

print(f"Sliced by name: {subset.shape}")
print(f"Row names: {list(subset.row_names)}")
print(f"Col names: {list(subset.column_names)}")
Sliced by name: (3, 2)
Row names: ['ENSG00001', 'ENSG00010', 'ENSG00050']
Col names: ['SAMPLE_000', 'SAMPLE_005']
5. Query Filtering¶
Use TileDB query strings to filter rows/columns. Requires string-indexed frames.
# Filter by gene_type
# The query string refers to a column of the row metadata frame.
protein_coding_query = "gene_type == 'protein_coding'"
subset = se.slice(row_query=protein_coding_query)

print(f"Protein-coding genes: {subset.shape[0]}")
print(f"Row names (first 5): {list(subset.row_names[:5])}")
Protein-coding genes: 35
Row names (first 5): ['ENSG00007', 'ENSG00011', 'ENSG00014', 'ENSG00021', 'ENSG00022']
# Filter by tissue
# The query string refers to a column of the column metadata frame.
liver_query = "tissue == 'liver'"
subset = se.slice(col_query=liver_query)

print(f"Liver samples: {subset.shape[1]}")
print(f"Col names: {list(subset.column_names)}")
Liver samples: 7
Col names: ['SAMPLE_001', 'SAMPLE_003', 'SAMPLE_008', 'SAMPLE_010', 'SAMPLE_011', 'SAMPLE_016', 'SAMPLE_017']
# Combined query
# A row filter and a column filter are passed to the same slice() call.
row_filter = "gene_type == 'protein_coding'"
col_filter = "treatment == 'control'"
subset = se.slice(row_query=row_filter, col_query=col_filter)

print(f"Protein-coding genes in control samples: {subset.shape}")
Protein-coding genes in control samples: (35, 11)
6. Access Assay Data¶
# Get a subset and access the assay data
subset = se[0:5, 0:3]
print(f"Assay names: {subset.assay_names}")

# Pull each matrix out of the sliced object by assay name before printing it.
raw_counts = subset.assays["counts"]
log_counts = subset.assays["logcounts"]

print("\nCounts (raw):")
print(raw_counts)
print("\nLogcounts (log-normalized):")
print(log_counts)
Assay names: ['counts', 'logcounts']
Counts (raw):
[[102. 435. 860.]
[661. 308. 769.]
[ 58. 510. 681.]
[646. 20. 840.]
[508. 775. 942.]]
Logcounts (log-normalized):
[[3.12915439 4.45202228 5.14963962]
[4.96023053 4.1115703 5.03848449]
[2.59727886 4.60937154 4.91779329]
[4.93743898 1.58900324 5.12624724]
[4.69906758 5.02441301 5.24020736]]
# Assay introspection (no data loaded)
# Both calls are made on the CellArraySE itself and take the assay name;
# they report whether the assay is sparse and what its dtype is.
print(f"counts is sparse: {se.is_sparse('counts')}")
print(f"counts dtype: {se.get_assay_type('counts')}")
counts is sparse: False
counts dtype: float64
Cleanup¶
# Remove the demo directory and everything created inside it.
shutil.rmtree(DATA_DIR)