cellarr-se Demo

cellarr-se is a read-only coordinator for TileDB-backed datasets that returns standard SummarizedExperiment objects when sliced.

Key features:

- Lazy loading from TileDB arrays
- String-indexed frames for name-based slicing
- TileDB query filtering
- Returns an in-memory SummarizedExperiment on slice

import os
import shutil
import numpy as np
import pandas as pd

from cellarr_se import CellArraySE
from cellarr_array import create_cellarray
from cellarr_frame import CellArrayFrame

# Setup: dimensions of the demo dataset, plus a scratch directory that is
# wiped and recreated so every run starts from an empty folder.
N_GENES, N_SAMPLES = 100, 20

DATA_DIR = os.path.abspath("../files/demo")
if os.path.exists(DATA_DIR):
    # Remove whatever an earlier run left behind before recreating the folder.
    shutil.rmtree(DATA_DIR)
os.makedirs(DATA_DIR)

1. Create TileDB-backed Components

# Create the two assays: raw counts and log-normalized counts.
# Seeding NumPy's global RNG makes the random data below reproducible.
np.random.seed(42)
assay_shape = (N_GENES, N_SAMPLES)

# Raw counts: random integers in [0, 1000), stored as float64
counts_data = np.random.randint(0, 1000, assay_shape).astype(np.float64)
counts = create_cellarray(
    uri=os.path.join(DATA_DIR, "counts.tdb"),
    shape=assay_shape,
    attr_dtype=np.float64,
    sparse=False,
)
counts.write_batch(counts_data, start_row=0)

# Log-normalized counts: log1p(counts / size_factor), where each sample's
# size factor is its total count divided by 10,000 (i.e. counts are rescaled
# to 10,000 per sample before the log transform).
size_factors = np.sum(counts_data, axis=0) / 1e4
logcounts_data = np.log1p(counts_data / size_factors)
logcounts = create_cellarray(
    uri=os.path.join(DATA_DIR, "logcounts.tdb"),
    shape=assay_shape,
    attr_dtype=np.float64,
    sparse=False,
)
logcounts.write_batch(logcounts_data, start_row=0)

# Report the shape of each array that was just written
print(f"Counts array: {counts.shape}")
print(f"Logcounts array: {logcounts.shape}")

Counts array: (100, 20)
Logcounts array: (100, 20)

# Create metadata frames indexed by STRINGS rather than integers.
# String indices are what enable name-based slicing and query filtering.

gene_ids = [f"ENSG{i:05d}" for i in range(N_GENES)]
sample_ids = [f"SAMPLE_{i:03d}" for i in range(N_SAMPLES)]

# Row metadata (genes): one randomly assigned biotype per gene
gene_types = np.random.choice(["protein_coding", "lncRNA", "pseudogene"], N_GENES)
row_df = pd.DataFrame(
    {
        "gene_name": [f"Gene_{i}" for i in range(N_GENES)],
        "gene_type": gene_types,
    },
    index=pd.Index(gene_ids, name="gene_id"),
)

row_frame = CellArrayFrame.create(os.path.join(DATA_DIR, "row_data.tdb"), row_df)

# Read the stored frame back once and reuse it for both the summary and preview
row_preview = row_frame[:]
print(f"Row frame: {len(row_preview.index)} rows, columns: {row_frame.column_names}")
print(row_preview.head())

Row frame: 100 rows, columns: ['gene_name', 'gene_type']
          gene_name   gene_type
gene_id                        
ENSG00000    Gene_0      lncRNA
ENSG00001    Gene_1  pseudogene
ENSG00002    Gene_2  pseudogene
ENSG00003    Gene_3      lncRNA
ENSG00004    Gene_4  pseudogene

# Column metadata (samples): a random tissue and treatment label per sample.
# The tissue draw must stay ahead of the treatment draw: both consume the same
# global NumPy random stream, so swapping them would change the labels.
tissues = np.random.choice(["liver", "kidney", "brain", "heart"], N_SAMPLES)
treatments = np.random.choice(["control", "treated"], N_SAMPLES)
col_df = pd.DataFrame(
    {"tissue": tissues, "treatment": treatments},
    index=pd.Index(sample_ids, name="sample_id"),
)

col_frame = CellArrayFrame.create(os.path.join(DATA_DIR, "col_data.tdb"), col_df)

# Read the stored frame back once and reuse it for both the summary and listing
col_preview = col_frame[:]
print(f"Col frame: {len(col_preview.index)} rows, columns: {col_frame.column_names}")
print(col_preview)

Col frame: 20 rows, columns: ['tissue', 'treatment']
            tissue treatment
sample_id                   
SAMPLE_000  kidney   control
SAMPLE_001   liver   control
SAMPLE_002   brain   control
SAMPLE_003   liver   control
SAMPLE_004  kidney   treated
SAMPLE_005  kidney   treated
SAMPLE_006   brain   control
SAMPLE_007  kidney   control
SAMPLE_008   liver   control
SAMPLE_009  kidney   treated
SAMPLE_010   liver   control
SAMPLE_011   liver   treated
SAMPLE_012  kidney   treated
SAMPLE_013   brain   treated
SAMPLE_014   heart   control
SAMPLE_015  kidney   treated
SAMPLE_016   liver   control
SAMPLE_017   liver   treated
SAMPLE_018  kidney   control
SAMPLE_019   brain   treated

2. Initialize CellArraySE

# Bundle the two assays with their row/column metadata frames
assays = {"counts": counts, "logcounts": logcounts}
se = CellArraySE(assays=assays, row_data=row_frame, col_data=col_frame)

# One-line summary first, then a preview limited to 5 metadata rows
print(se)
se.show(n=5)

<CellArraySE: 100x20 | counts, logcounts>
CellArraySE Object | 100 rows x 20 cols
Assays: counts, logcounts

--- Row Data ---
          gene_name   gene_type
gene_id                        
ENSG00000    Gene_0      lncRNA
ENSG00001    Gene_1  pseudogene
ENSG00002    Gene_2  pseudogene
ENSG00003    Gene_3      lncRNA
ENSG00004    Gene_4  pseudogene

--- Column Data ---
            tissue treatment
sample_id                   
SAMPLE_000  kidney   control
SAMPLE_001   liver   control
SAMPLE_002   brain   control
SAMPLE_003   liver   control
SAMPLE_004  kidney   treated

3. Properties

# Grab the first five row/column names up front, then print the overview
head_rows = list(se.row_names[:5])
head_cols = list(se.col_names[:5])

print(f"Shape: {se.shape}")
print(f"Assays: {se.assay_names}")
print(f"Row columns: {se.row_columns}")
print(f"Col columns: {se.col_columns}")
print(f"\nRow names (first 5): {head_rows}")
print(f"Col names (first 5): {head_cols}")

Shape: (100, 20)
Assays: ['counts', 'logcounts']
Row columns: ['gene_name', 'gene_type']
Col columns: ['tissue', 'treatment']

Row names (first 5): ['ENSG00000', 'ENSG00001', 'ENSG00002', 'ENSG00003', 'ENSG00004']
Col names (first 5): ['SAMPLE_000', 'SAMPLE_001', 'SAMPLE_002', 'SAMPLE_003', 'SAMPLE_004']

4. Slicing

Two ways to slice:

- `se[rows, cols]` - bracket notation
- `se.slice(...)` - method with query support

Both return a SummarizedExperiment object.

# Positional slicing: rows 0-9 and columns 0-4, given as explicit slice objects
row_window, col_window = slice(0, 10), slice(0, 5)
subset = se[row_window, col_window]
result_type = type(subset).__name__
print(f"se[0:10, 0:5] -> {result_type} {subset.shape}")

se[0:10, 0:5] -> SummarizedExperiment (10, 5)

# Slicing by NAME (requires string-indexed frames): pick genes and samples
# by their identifiers instead of their positions
wanted_genes = ["ENSG00001", "ENSG00010", "ENSG00050"]
wanted_samples = ["SAMPLE_000", "SAMPLE_005"]
subset = se[wanted_genes, wanted_samples]

print(f"Sliced by name: {subset.shape}")
print(f"Row names: {list(subset.row_names)}")
print(f"Col names: {list(subset.column_names)}")

Sliced by name: (3, 2)
Row names: ['ENSG00001', 'ENSG00010', 'ENSG00050']
Col names: ['SAMPLE_000', 'SAMPLE_005']

5. Query Filtering

Use TileDB query strings to filter rows/columns. Requires string-indexed frames.

# Filter rows with a query on the gene_type column of the row metadata
protein_coding_filter = "gene_type == 'protein_coding'"
subset = se.slice(row_query=protein_coding_filter)

n_matching_genes = subset.shape[0]
first_matches = list(subset.row_names[:5])
print(f"Protein-coding genes: {n_matching_genes}")
print(f"Row names (first 5): {first_matches}")

Protein-coding genes: 35
Row names (first 5): ['ENSG00007', 'ENSG00011', 'ENSG00014', 'ENSG00021', 'ENSG00022']

# Filter columns with a query on the tissue column of the sample metadata
liver_filter = "tissue == 'liver'"
subset = se.slice(col_query=liver_filter)

n_liver_samples = subset.shape[1]
liver_sample_names = list(subset.column_names)
print(f"Liver samples: {n_liver_samples}")
print(f"Col names: {liver_sample_names}")

Liver samples: 7
Col names: ['SAMPLE_001', 'SAMPLE_003', 'SAMPLE_008', 'SAMPLE_010', 'SAMPLE_011', 'SAMPLE_016', 'SAMPLE_017']

# Combined query: restrict rows and columns in a single slice() call
query_filters = {
    "row_query": "gene_type == 'protein_coding'",
    "col_query": "treatment == 'control'",
}
subset = se.slice(**query_filters)
print(f"Protein-coding genes in control samples: {subset.shape}")

Protein-coding genes in control samples: (35, 11)

6. Access Assay Data

# Take a small 5x3 slice and pull both assay matrices out of the result
subset = se[0:5, 0:3]
raw_block = subset.assays["counts"]
normalized_block = subset.assays["logcounts"]

print(f"Assay names: {subset.assay_names}")
print("\nCounts (raw):")
print(raw_block)
print("\nLogcounts (log-normalized):")
print(normalized_block)

Assay names: ['counts', 'logcounts']

Counts (raw):
[[102. 435. 860.]
 [661. 308. 769.]
 [ 58. 510. 681.]
 [646.  20. 840.]
 [508. 775. 942.]]

Logcounts (log-normalized):
[[3.12915439 4.45202228 5.14963962]
 [4.96023053 4.1115703  5.03848449]
 [2.59727886 4.60937154 4.91779329]
 [4.93743898 1.58900324 5.12624724]
 [4.69906758 5.02441301 5.24020736]]

# Assay introspection (no data loaded): ask the coordinator about the
# "counts" assay's storage layout and element type
counts_is_sparse = se.is_sparse("counts")
counts_dtype = se.get_assay_type("counts")
print(f"counts is sparse: {counts_is_sparse}")
print(f"counts dtype: {counts_dtype}")

counts is sparse: False
counts dtype: float64

Cleanup

# Delete the demo directory and every TileDB array/frame written into it
shutil.rmtree(DATA_DIR)