{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# cellarr-se Demo\n", "\n", "cellarr-se is a read-only coordinator for TileDB-backed datasets that returns\n", "standard `SummarizedExperiment` objects when sliced.\n", "\n", "Key features:\n", "- Lazy loading from TileDB arrays\n", "- String-indexed frames for name-based slicing\n", "- TileDB query filtering\n", "- Returns in-memory `SummarizedExperiment` on slice" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "execution": { "iopub.execute_input": "2026-04-01T15:24:59.278293Z", "iopub.status.busy": "2026-04-01T15:24:59.278103Z", "iopub.status.idle": "2026-04-01T15:24:59.766555Z", "shell.execute_reply": "2026-04-01T15:24:59.765502Z" } }, "outputs": [], "source": [ "import os\n", "import shutil\n", "import numpy as np\n", "import pandas as pd\n", "\n", "from cellarr_se import CellArraySE\n", "from cellarr_array import create_cellarray\n", "from cellarr_frame import CellArrayFrame" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "execution": { "iopub.execute_input": "2026-04-01T15:24:59.769336Z", "iopub.status.busy": "2026-04-01T15:24:59.769022Z", "iopub.status.idle": "2026-04-01T15:24:59.781085Z", "shell.execute_reply": "2026-04-01T15:24:59.778867Z" } }, "outputs": [], "source": [ "# Setup\n", "DATA_DIR = os.path.abspath(\"../files/demo\")\n", "if os.path.exists(DATA_DIR):\n", " shutil.rmtree(DATA_DIR)\n", "os.makedirs(DATA_DIR)\n", "\n", "N_GENES = 100\n", "N_SAMPLES = 20" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 1. 
Create TileDB-backed Components" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "execution": { "iopub.execute_input": "2026-04-01T15:24:59.783159Z", "iopub.status.busy": "2026-04-01T15:24:59.782975Z", "iopub.status.idle": "2026-04-01T15:24:59.894720Z", "shell.execute_reply": "2026-04-01T15:24:59.893794Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Counts array: (100, 20)\n", "Logcounts array: (100, 20)\n" ] } ], "source": [ "# Create assays (counts and log-normalized counts)\n", "np.random.seed(42)\n", "\n", "# Raw counts\n", "counts_data = np.random.randint(0, 1000, (N_GENES, N_SAMPLES)).astype(np.float64)\n", "counts = create_cellarray(\n", " uri=os.path.join(DATA_DIR, \"counts.tdb\"),\n", " shape=(N_GENES, N_SAMPLES),\n", " attr_dtype=np.float64,\n", " sparse=False,\n", ")\n", "counts.write_batch(counts_data, start_row=0)\n", "\n", "# Log-normalized counts: log1p(counts / size_factors), where size_factors = per-sample total / 1e4\n", "size_factors = counts_data.sum(axis=0) / 1e4\n", "logcounts_data = np.log1p(counts_data / size_factors)\n", "logcounts = create_cellarray(\n", " uri=os.path.join(DATA_DIR, \"logcounts.tdb\"),\n", " shape=(N_GENES, N_SAMPLES),\n", " attr_dtype=np.float64,\n", " sparse=False,\n", ")\n", "logcounts.write_batch(logcounts_data, start_row=0)\n", "\n", "print(f\"Counts array: {counts.shape}\")\n", "print(f\"Logcounts array: {logcounts.shape}\")" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "execution": { "iopub.execute_input": "2026-04-01T15:24:59.948817Z", "iopub.status.busy": "2026-04-01T15:24:59.948517Z", "iopub.status.idle": "2026-04-01T15:25:00.017173Z", "shell.execute_reply": "2026-04-01T15:25:00.016278Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Row frame: 100 rows, columns: ['gene_name', 'gene_type']\n", " gene_name gene_type\n", "gene_id \n", "ENSG00000 Gene_0 lncRNA\n", "ENSG00001 Gene_1 pseudogene\n", "ENSG00002 Gene_2 pseudogene\n", "ENSG00003 Gene_3 lncRNA\n", 
"ENSG00004 Gene_4 pseudogene\n" ] } ], "source": [ "# Create metadata frames with STRING INDICES\n", "# This enables name-based slicing and query filtering\n", "\n", "gene_ids = [f\"ENSG{i:05d}\" for i in range(N_GENES)]\n", "sample_ids = [f\"SAMPLE_{i:03d}\" for i in range(N_SAMPLES)]\n", "\n", "# Row metadata (genes)\n", "row_df = pd.DataFrame(\n", " {\n", " \"gene_name\": [f\"Gene_{i}\" for i in range(N_GENES)],\n", " \"gene_type\": np.random.choice([\"protein_coding\", \"lncRNA\", \"pseudogene\"], N_GENES),\n", " },\n", " index=gene_ids,\n", ")\n", "row_df.index.name = \"gene_id\"\n", "\n", "row_frame = CellArrayFrame.create(os.path.join(DATA_DIR, \"row_data.tdb\"), row_df)\n", "print(f\"Row frame: {len(row_frame[:].index)} rows, columns: {row_frame.column_names}\")\n", "print(row_frame[:].head())" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "execution": { "iopub.execute_input": "2026-04-01T15:25:00.019702Z", "iopub.status.busy": "2026-04-01T15:25:00.019531Z", "iopub.status.idle": "2026-04-01T15:25:00.066545Z", "shell.execute_reply": "2026-04-01T15:25:00.065955Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Col frame: 20 rows, columns: ['tissue', 'treatment']\n", " tissue treatment\n", "sample_id \n", "SAMPLE_000 kidney control\n", "SAMPLE_001 liver control\n", "SAMPLE_002 brain control\n", "SAMPLE_003 liver control\n", "SAMPLE_004 kidney treated\n", "SAMPLE_005 kidney treated\n", "SAMPLE_006 brain control\n", "SAMPLE_007 kidney control\n", "SAMPLE_008 liver control\n", "SAMPLE_009 kidney treated\n", "SAMPLE_010 liver control\n", "SAMPLE_011 liver treated\n", "SAMPLE_012 kidney treated\n", "SAMPLE_013 brain treated\n", "SAMPLE_014 heart control\n", "SAMPLE_015 kidney treated\n", "SAMPLE_016 liver control\n", "SAMPLE_017 liver treated\n", "SAMPLE_018 kidney control\n", "SAMPLE_019 brain treated\n" ] } ], "source": [ "# Column metadata (samples)\n", "col_df = pd.DataFrame(\n", " {\n", " \"tissue\": 
np.random.choice([\"liver\", \"kidney\", \"brain\", \"heart\"], N_SAMPLES),\n", " \"treatment\": np.random.choice([\"control\", \"treated\"], N_SAMPLES),\n", " },\n", " index=sample_ids,\n", ")\n", "col_df.index.name = \"sample_id\"\n", "\n", "col_frame = CellArrayFrame.create(os.path.join(DATA_DIR, \"col_data.tdb\"), col_df)\n", "print(f\"Col frame: {len(col_frame[:].index)} rows, columns: {col_frame.column_names}\")\n", "print(col_frame[:])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 2. Initialize CellArraySE" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "execution": { "iopub.execute_input": "2026-04-01T15:25:00.068155Z", "iopub.status.busy": "2026-04-01T15:25:00.068002Z", "iopub.status.idle": "2026-04-01T15:25:00.098627Z", "shell.execute_reply": "2026-04-01T15:25:00.097560Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "CellArraySE Object | 100 rows x 20 cols\n", "Assays: counts, logcounts\n", "\n", "--- Row Data ---\n", " gene_name gene_type\n", "gene_id \n", "ENSG00000 Gene_0 lncRNA\n", "ENSG00001 Gene_1 pseudogene\n", "ENSG00002 Gene_2 pseudogene\n", "ENSG00003 Gene_3 lncRNA\n", "ENSG00004 Gene_4 pseudogene\n", "\n", "--- Column Data ---\n", " tissue treatment\n", "sample_id \n", "SAMPLE_000 kidney control\n", "SAMPLE_001 liver control\n", "SAMPLE_002 brain control\n", "SAMPLE_003 liver control\n", "SAMPLE_004 kidney treated\n" ] } ], "source": [ "se = CellArraySE(\n", " assays={\"counts\": counts, \"logcounts\": logcounts},\n", " row_data=row_frame,\n", " col_data=col_frame,\n", ")\n", "\n", "print(se)\n", "se.show(n=5)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 3. 
Properties" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "execution": { "iopub.execute_input": "2026-04-01T15:25:00.101114Z", "iopub.status.busy": "2026-04-01T15:25:00.100464Z", "iopub.status.idle": "2026-04-01T15:25:00.105086Z", "shell.execute_reply": "2026-04-01T15:25:00.104490Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Shape: (100, 20)\n", "Assays: ['counts', 'logcounts']\n", "Row columns: ['gene_name', 'gene_type']\n", "Col columns: ['tissue', 'treatment']\n", "\n", "Row names (first 5): ['ENSG00000', 'ENSG00001', 'ENSG00002', 'ENSG00003', 'ENSG00004']\n", "Col names (first 5): ['SAMPLE_000', 'SAMPLE_001', 'SAMPLE_002', 'SAMPLE_003', 'SAMPLE_004']\n" ] } ], "source": [ "print(f\"Shape: {se.shape}\")\n", "print(f\"Assays: {se.assay_names}\")\n", "print(f\"Row columns: {se.row_columns}\")\n", "print(f\"Col columns: {se.col_columns}\")\n", "print(f\"\\nRow names (first 5): {list(se.row_names[:5])}\")\n", "print(f\"Col names (first 5): {list(se.col_names[:5])}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 4. Slicing\n", "\n", "Two ways to slice:\n", "- `se[rows, cols]` - bracket notation\n", "- `se.slice(...)` - method with query support\n", "\n", "Both return a `SummarizedExperiment` object." 
] }, { "cell_type": "code", "execution_count": 8, "metadata": { "execution": { "iopub.execute_input": "2026-04-01T15:25:00.107255Z", "iopub.status.busy": "2026-04-01T15:25:00.107067Z", "iopub.status.idle": "2026-04-01T15:25:00.133759Z", "shell.execute_reply": "2026-04-01T15:25:00.133016Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "se[0:10, 0:5] -> SummarizedExperiment (10, 5)\n" ] } ], "source": [ "# Positional slicing\n", "subset = se[0:10, 0:5]\n", "print(f\"se[0:10, 0:5] -> {type(subset).__name__} {subset.shape}\")" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "execution": { "iopub.execute_input": "2026-04-01T15:25:00.135729Z", "iopub.status.busy": "2026-04-01T15:25:00.135537Z", "iopub.status.idle": "2026-04-01T15:25:00.156491Z", "shell.execute_reply": "2026-04-01T15:25:00.155511Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Sliced by name: (3, 2)\n", "Row names: ['ENSG00001', 'ENSG00010', 'ENSG00050']\n", "Col names: ['SAMPLE_000', 'SAMPLE_005']\n" ] } ], "source": [ "# Slicing by NAME (requires string-indexed frames)\n", "subset = se[[\"ENSG00001\", \"ENSG00010\", \"ENSG00050\"], [\"SAMPLE_000\", \"SAMPLE_005\"]]\n", "print(f\"Sliced by name: {subset.shape}\")\n", "print(f\"Row names: {list(subset.row_names)}\")\n", "print(f\"Col names: {list(subset.column_names)}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 5. Query Filtering\n", "\n", "Use TileDB query strings to filter rows/columns.\n", "Requires string-indexed frames." 
] }, { "cell_type": "code", "execution_count": 10, "metadata": { "execution": { "iopub.execute_input": "2026-04-01T15:25:00.157964Z", "iopub.status.busy": "2026-04-01T15:25:00.157814Z", "iopub.status.idle": "2026-04-01T15:25:00.180982Z", "shell.execute_reply": "2026-04-01T15:25:00.180032Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Protein-coding genes: 35\n", "Row names (first 5): ['ENSG00007', 'ENSG00011', 'ENSG00014', 'ENSG00021', 'ENSG00022']\n" ] } ], "source": [ "# Filter by gene_type\n", "subset = se.slice(row_query=\"gene_type == 'protein_coding'\")\n", "print(f\"Protein-coding genes: {subset.shape[0]}\")\n", "print(f\"Row names (first 5): {list(subset.row_names[:5])}\")" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "execution": { "iopub.execute_input": "2026-04-01T15:25:00.183215Z", "iopub.status.busy": "2026-04-01T15:25:00.183062Z", "iopub.status.idle": "2026-04-01T15:25:00.201994Z", "shell.execute_reply": "2026-04-01T15:25:00.201058Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Liver samples: 7\n", "Col names: ['SAMPLE_001', 'SAMPLE_003', 'SAMPLE_008', 'SAMPLE_010', 'SAMPLE_011', 'SAMPLE_016', 'SAMPLE_017']\n" ] } ], "source": [ "# Filter by tissue\n", "subset = se.slice(col_query=\"tissue == 'liver'\")\n", "print(f\"Liver samples: {subset.shape[1]}\")\n", "print(f\"Col names: {list(subset.column_names)}\")" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "execution": { "iopub.execute_input": "2026-04-01T15:25:00.204290Z", "iopub.status.busy": "2026-04-01T15:25:00.204057Z", "iopub.status.idle": "2026-04-01T15:25:00.231004Z", "shell.execute_reply": "2026-04-01T15:25:00.230251Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Protein-coding genes in control samples: (35, 11)\n" ] } ], "source": [ "# Combined query\n", "subset = se.slice(\n", " row_query=\"gene_type == 'protein_coding'\",\n", " col_query=\"treatment == 'control'\",\n", 
")\n", "print(f\"Protein-coding genes in control samples: {subset.shape}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 6. Access Assay Data" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "execution": { "iopub.execute_input": "2026-04-01T15:25:00.232541Z", "iopub.status.busy": "2026-04-01T15:25:00.232401Z", "iopub.status.idle": "2026-04-01T15:25:00.263032Z", "shell.execute_reply": "2026-04-01T15:25:00.262259Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Assay names: ['counts', 'logcounts']" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n", "Counts (raw):\n", "[[102. 435. 860.]\n", " [661. 308. 769.]\n", " [ 58. 510. 681.]\n", " [646. 20. 840.]\n", " [508. 775. 942.]]\n", "\n", "Logcounts (log-normalized):\n", "[[3.12915439 4.45202228 5.14963962]\n", " [4.96023053 4.1115703 5.03848449]\n", " [2.59727886 4.60937154 4.91779329]\n", " [4.93743898 1.58900324 5.12624724]\n", " [4.69906758 5.02441301 5.24020736]]\n" ] } ], "source": [ "# Get a subset and access the assay data\n", "subset = se[0:5, 0:3]\n", "print(f\"Assay names: {subset.assay_names}\")\n", "print(\"\\nCounts (raw):\")\n", "print(subset.assays[\"counts\"])\n", "print(\"\\nLogcounts (log-normalized):\")\n", "print(subset.assays[\"logcounts\"])" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "execution": { "iopub.execute_input": "2026-04-01T15:25:00.264474Z", "iopub.status.busy": "2026-04-01T15:25:00.264326Z", "iopub.status.idle": "2026-04-01T15:25:00.267913Z", "shell.execute_reply": "2026-04-01T15:25:00.267269Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "counts is sparse: False\n", "counts dtype: float64\n" ] } ], "source": [ "# Assay introspection (no data loaded)\n", "print(f\"counts is sparse: {se.is_sparse('counts')}\")\n", "print(f\"counts dtype: {se.get_assay_type('counts')}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Cleanup" ] }, { 
"cell_type": "code", "execution_count": 15, "metadata": { "execution": { "iopub.execute_input": "2026-04-01T15:25:00.269471Z", "iopub.status.busy": "2026-04-01T15:25:00.269327Z", "iopub.status.idle": "2026-04-01T15:25:00.281112Z", "shell.execute_reply": "2026-04-01T15:25:00.278573Z" } }, "outputs": [], "source": [ "# Remove the demo TileDB directory. Guarded like the setup cell so that\n", "# re-running this cell after the directory is gone does not raise.\n", "if os.path.exists(DATA_DIR):\n", "    shutil.rmtree(DATA_DIR)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.15" } }, "nbformat": 4, "nbformat_minor": 4 }