{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# cellarr-se Demo\n",
    "\n",
    "cellarr-se is a read-only coordinator for TileDB-backed datasets that returns\n",
    "standard `SummarizedExperiment` objects when sliced.\n",
    "\n",
    "Key features:\n",
    "- Lazy loading from TileDB arrays\n",
    "- String-indexed frames for name-based slicing\n",
    "- TileDB query filtering\n",
    "- Returns in-memory `SummarizedExperiment` on slice"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2026-04-01T15:24:59.278293Z",
     "iopub.status.busy": "2026-04-01T15:24:59.278103Z",
     "iopub.status.idle": "2026-04-01T15:24:59.766555Z",
     "shell.execute_reply": "2026-04-01T15:24:59.765502Z"
    }
   },
   "outputs": [],
   "source": [
    "import os\n",
    "import shutil\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "\n",
    "from cellarr_se import CellArraySE\n",
    "from cellarr_array import create_cellarray\n",
    "from cellarr_frame import CellArrayFrame"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2026-04-01T15:24:59.769336Z",
     "iopub.status.busy": "2026-04-01T15:24:59.769022Z",
     "iopub.status.idle": "2026-04-01T15:24:59.781085Z",
     "shell.execute_reply": "2026-04-01T15:24:59.778867Z"
    }
   },
   "outputs": [],
   "source": [
    "# Setup\n",
    "DATA_DIR = os.path.abspath(\"../files/demo\")\n",
    "if os.path.exists(DATA_DIR):\n",
    "    shutil.rmtree(DATA_DIR)\n",
    "os.makedirs(DATA_DIR)\n",
    "\n",
    "N_GENES = 100\n",
    "N_SAMPLES = 20"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1. Create TileDB-backed Components"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2026-04-01T15:24:59.783159Z",
     "iopub.status.busy": "2026-04-01T15:24:59.782975Z",
     "iopub.status.idle": "2026-04-01T15:24:59.894720Z",
     "shell.execute_reply": "2026-04-01T15:24:59.893794Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Counts array: (100, 20)\n",
      "Logcounts array: (100, 20)\n"
     ]
    }
   ],
   "source": [
    "# Create assays (counts and log-normalized counts)\n",
    "np.random.seed(42)\n",
    "\n",
    "# Raw counts\n",
    "counts_data = np.random.randint(0, 1000, (N_GENES, N_SAMPLES)).astype(np.float64)\n",
    "counts = create_cellarray(\n",
    "    uri=os.path.join(DATA_DIR, \"counts.tdb\"),\n",
    "    shape=(N_GENES, N_SAMPLES),\n",
    "    attr_dtype=np.float64,\n",
    "    sparse=False,\n",
    ")\n",
    "counts.write_batch(counts_data, start_row=0)\n",
    "\n",
    "# Log-normalized counts: log1p(counts / size_factor * 10000)\n",
    "size_factors = counts_data.sum(axis=0) / 1e4\n",
    "logcounts_data = np.log1p(counts_data / size_factors)\n",
    "logcounts = create_cellarray(\n",
    "    uri=os.path.join(DATA_DIR, \"logcounts.tdb\"),\n",
    "    shape=(N_GENES, N_SAMPLES),\n",
    "    attr_dtype=np.float64,\n",
    "    sparse=False,\n",
    ")\n",
    "logcounts.write_batch(logcounts_data, start_row=0)\n",
    "\n",
    "print(f\"Counts array: {counts.shape}\")\n",
    "print(f\"Logcounts array: {logcounts.shape}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2026-04-01T15:24:59.948817Z",
     "iopub.status.busy": "2026-04-01T15:24:59.948517Z",
     "iopub.status.idle": "2026-04-01T15:25:00.017173Z",
     "shell.execute_reply": "2026-04-01T15:25:00.016278Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Row frame: 100 rows, columns: ['gene_name', 'gene_type']\n",
      "          gene_name   gene_type\n",
      "gene_id                        \n",
      "ENSG00000    Gene_0      lncRNA\n",
      "ENSG00001    Gene_1  pseudogene\n",
      "ENSG00002    Gene_2  pseudogene\n",
      "ENSG00003    Gene_3      lncRNA\n",
      "ENSG00004    Gene_4  pseudogene\n"
     ]
    }
   ],
   "source": [
    "# Create metadata frames with STRING INDICES\n",
    "# This enables name-based slicing and query filtering\n",
    "\n",
    "gene_ids = [f\"ENSG{i:05d}\" for i in range(N_GENES)]\n",
    "sample_ids = [f\"SAMPLE_{i:03d}\" for i in range(N_SAMPLES)]\n",
    "\n",
    "# Row metadata (genes)\n",
    "row_df = pd.DataFrame(\n",
    "    {\n",
    "        \"gene_name\": [f\"Gene_{i}\" for i in range(N_GENES)],\n",
    "        \"gene_type\": np.random.choice([\"protein_coding\", \"lncRNA\", \"pseudogene\"], N_GENES),\n",
    "    },\n",
    "    index=gene_ids,\n",
    ")\n",
    "row_df.index.name = \"gene_id\"\n",
    "\n",
    "row_frame = CellArrayFrame.create(os.path.join(DATA_DIR, \"row_data.tdb\"), row_df)\n",
    "print(f\"Row frame: {len(row_frame[:].index)} rows, columns: {row_frame.column_names}\")\n",
    "print(row_frame[:].head())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2026-04-01T15:25:00.019702Z",
     "iopub.status.busy": "2026-04-01T15:25:00.019531Z",
     "iopub.status.idle": "2026-04-01T15:25:00.066545Z",
     "shell.execute_reply": "2026-04-01T15:25:00.065955Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Col frame: 20 rows, columns: ['tissue', 'treatment']\n",
      "            tissue treatment\n",
      "sample_id                   \n",
      "SAMPLE_000  kidney   control\n",
      "SAMPLE_001   liver   control\n",
      "SAMPLE_002   brain   control\n",
      "SAMPLE_003   liver   control\n",
      "SAMPLE_004  kidney   treated\n",
      "SAMPLE_005  kidney   treated\n",
      "SAMPLE_006   brain   control\n",
      "SAMPLE_007  kidney   control\n",
      "SAMPLE_008   liver   control\n",
      "SAMPLE_009  kidney   treated\n",
      "SAMPLE_010   liver   control\n",
      "SAMPLE_011   liver   treated\n",
      "SAMPLE_012  kidney   treated\n",
      "SAMPLE_013   brain   treated\n",
      "SAMPLE_014   heart   control\n",
      "SAMPLE_015  kidney   treated\n",
      "SAMPLE_016   liver   control\n",
      "SAMPLE_017   liver   treated\n",
      "SAMPLE_018  kidney   control\n",
      "SAMPLE_019   brain   treated\n"
     ]
    }
   ],
   "source": [
    "# Column metadata (samples)\n",
    "col_df = pd.DataFrame(\n",
    "    {\n",
    "        \"tissue\": np.random.choice([\"liver\", \"kidney\", \"brain\", \"heart\"], N_SAMPLES),\n",
    "        \"treatment\": np.random.choice([\"control\", \"treated\"], N_SAMPLES),\n",
    "    },\n",
    "    index=sample_ids,\n",
    ")\n",
    "col_df.index.name = \"sample_id\"\n",
    "\n",
    "col_frame = CellArrayFrame.create(os.path.join(DATA_DIR, \"col_data.tdb\"), col_df)\n",
    "print(f\"Col frame: {len(col_frame[:].index)} rows, columns: {col_frame.column_names}\")\n",
    "print(col_frame[:])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2. Initialize CellArraySE"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2026-04-01T15:25:00.068155Z",
     "iopub.status.busy": "2026-04-01T15:25:00.068002Z",
     "iopub.status.idle": "2026-04-01T15:25:00.098627Z",
     "shell.execute_reply": "2026-04-01T15:25:00.097560Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<CellArraySE: 100x20 | counts, logcounts>\n",
      "CellArraySE Object | 100 rows x 20 cols\n",
      "Assays: counts, logcounts\n",
      "\n",
      "--- Row Data ---\n",
      "          gene_name   gene_type\n",
      "gene_id                        \n",
      "ENSG00000    Gene_0      lncRNA\n",
      "ENSG00001    Gene_1  pseudogene\n",
      "ENSG00002    Gene_2  pseudogene\n",
      "ENSG00003    Gene_3      lncRNA\n",
      "ENSG00004    Gene_4  pseudogene\n",
      "\n",
      "--- Column Data ---\n",
      "            tissue treatment\n",
      "sample_id                   \n",
      "SAMPLE_000  kidney   control\n",
      "SAMPLE_001   liver   control\n",
      "SAMPLE_002   brain   control\n",
      "SAMPLE_003   liver   control\n",
      "SAMPLE_004  kidney   treated\n"
     ]
    }
   ],
   "source": [
    "se = CellArraySE(\n",
    "    assays={\"counts\": counts, \"logcounts\": logcounts},\n",
    "    row_data=row_frame,\n",
    "    col_data=col_frame,\n",
    ")\n",
    "\n",
    "print(se)\n",
    "se.show(n=5)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 3. Properties"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2026-04-01T15:25:00.101114Z",
     "iopub.status.busy": "2026-04-01T15:25:00.100464Z",
     "iopub.status.idle": "2026-04-01T15:25:00.105086Z",
     "shell.execute_reply": "2026-04-01T15:25:00.104490Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Shape: (100, 20)\n",
      "Assays: ['counts', 'logcounts']\n",
      "Row columns: ['gene_name', 'gene_type']\n",
      "Col columns: ['tissue', 'treatment']\n",
      "\n",
      "Row names (first 5): ['ENSG00000', 'ENSG00001', 'ENSG00002', 'ENSG00003', 'ENSG00004']\n",
      "Col names (first 5): ['SAMPLE_000', 'SAMPLE_001', 'SAMPLE_002', 'SAMPLE_003', 'SAMPLE_004']\n"
     ]
    }
   ],
   "source": [
    "print(f\"Shape: {se.shape}\")\n",
    "print(f\"Assays: {se.assay_names}\")\n",
    "print(f\"Row columns: {se.row_columns}\")\n",
    "print(f\"Col columns: {se.col_columns}\")\n",
    "print(f\"\\nRow names (first 5): {list(se.row_names[:5])}\")\n",
    "print(f\"Col names (first 5): {list(se.col_names[:5])}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 4. Slicing\n",
    "\n",
    "Two ways to slice:\n",
    "- `se[rows, cols]` - bracket notation\n",
    "- `se.slice(...)` - method with query support\n",
    "\n",
    "Both return a `SummarizedExperiment` object."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2026-04-01T15:25:00.107255Z",
     "iopub.status.busy": "2026-04-01T15:25:00.107067Z",
     "iopub.status.idle": "2026-04-01T15:25:00.133759Z",
     "shell.execute_reply": "2026-04-01T15:25:00.133016Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "se[0:10, 0:5] -> SummarizedExperiment (10, 5)\n"
     ]
    }
   ],
   "source": [
    "# Positional slicing\n",
    "subset = se[0:10, 0:5]\n",
    "print(f\"se[0:10, 0:5] -> {type(subset).__name__} {subset.shape}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2026-04-01T15:25:00.135729Z",
     "iopub.status.busy": "2026-04-01T15:25:00.135537Z",
     "iopub.status.idle": "2026-04-01T15:25:00.156491Z",
     "shell.execute_reply": "2026-04-01T15:25:00.155511Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Sliced by name: (3, 2)\n",
      "Row names: ['ENSG00001', 'ENSG00010', 'ENSG00050']\n",
      "Col names: ['SAMPLE_000', 'SAMPLE_005']\n"
     ]
    }
   ],
   "source": [
    "# Slicing by NAME (requires string-indexed frames)\n",
    "subset = se[[\"ENSG00001\", \"ENSG00010\", \"ENSG00050\"], [\"SAMPLE_000\", \"SAMPLE_005\"]]\n",
    "print(f\"Sliced by name: {subset.shape}\")\n",
    "print(f\"Row names: {list(subset.row_names)}\")\n",
    "print(f\"Col names: {list(subset.column_names)}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 5. Query Filtering\n",
    "\n",
    "Use TileDB query strings to filter rows/columns.\n",
    "Requires string-indexed frames."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2026-04-01T15:25:00.157964Z",
     "iopub.status.busy": "2026-04-01T15:25:00.157814Z",
     "iopub.status.idle": "2026-04-01T15:25:00.180982Z",
     "shell.execute_reply": "2026-04-01T15:25:00.180032Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Protein-coding genes: 35\n",
      "Row names (first 5): ['ENSG00007', 'ENSG00011', 'ENSG00014', 'ENSG00021', 'ENSG00022']\n"
     ]
    }
   ],
   "source": [
    "# Filter by gene_type\n",
    "subset = se.slice(row_query=\"gene_type == 'protein_coding'\")\n",
    "print(f\"Protein-coding genes: {subset.shape[0]}\")\n",
    "print(f\"Row names (first 5): {list(subset.row_names[:5])}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2026-04-01T15:25:00.183215Z",
     "iopub.status.busy": "2026-04-01T15:25:00.183062Z",
     "iopub.status.idle": "2026-04-01T15:25:00.201994Z",
     "shell.execute_reply": "2026-04-01T15:25:00.201058Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Liver samples: 7\n",
      "Col names: ['SAMPLE_001', 'SAMPLE_003', 'SAMPLE_008', 'SAMPLE_010', 'SAMPLE_011', 'SAMPLE_016', 'SAMPLE_017']\n"
     ]
    }
   ],
   "source": [
    "# Filter by tissue\n",
    "subset = se.slice(col_query=\"tissue == 'liver'\")\n",
    "print(f\"Liver samples: {subset.shape[1]}\")\n",
    "print(f\"Col names: {list(subset.column_names)}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2026-04-01T15:25:00.204290Z",
     "iopub.status.busy": "2026-04-01T15:25:00.204057Z",
     "iopub.status.idle": "2026-04-01T15:25:00.231004Z",
     "shell.execute_reply": "2026-04-01T15:25:00.230251Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Protein-coding genes in control samples: (35, 11)\n"
     ]
    }
   ],
   "source": [
    "# Combined query\n",
    "subset = se.slice(\n",
    "    row_query=\"gene_type == 'protein_coding'\",\n",
    "    col_query=\"treatment == 'control'\",\n",
    ")\n",
    "print(f\"Protein-coding genes in control samples: {subset.shape}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 6. Access Assay Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2026-04-01T15:25:00.232541Z",
     "iopub.status.busy": "2026-04-01T15:25:00.232401Z",
     "iopub.status.idle": "2026-04-01T15:25:00.263032Z",
     "shell.execute_reply": "2026-04-01T15:25:00.262259Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Assay names: ['counts', 'logcounts']"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "\n",
      "Counts (raw):\n",
      "[[102. 435. 860.]\n",
      " [661. 308. 769.]\n",
      " [ 58. 510. 681.]\n",
      " [646.  20. 840.]\n",
      " [508. 775. 942.]]\n",
      "\n",
      "Logcounts (log-normalized):\n",
      "[[3.12915439 4.45202228 5.14963962]\n",
      " [4.96023053 4.1115703  5.03848449]\n",
      " [2.59727886 4.60937154 4.91779329]\n",
      " [4.93743898 1.58900324 5.12624724]\n",
      " [4.69906758 5.02441301 5.24020736]]\n"
     ]
    }
   ],
   "source": [
    "# Get a subset and access the assay data\n",
    "subset = se[0:5, 0:3]\n",
    "print(f\"Assay names: {subset.assay_names}\")\n",
    "print(\"\\nCounts (raw):\")\n",
    "print(subset.assays[\"counts\"])\n",
    "print(\"\\nLogcounts (log-normalized):\")\n",
    "print(subset.assays[\"logcounts\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2026-04-01T15:25:00.264474Z",
     "iopub.status.busy": "2026-04-01T15:25:00.264326Z",
     "iopub.status.idle": "2026-04-01T15:25:00.267913Z",
     "shell.execute_reply": "2026-04-01T15:25:00.267269Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "counts is sparse: False\n",
      "counts dtype: float64\n"
     ]
    }
   ],
   "source": [
    "# Assay introspection (no data loaded)\n",
    "print(f\"counts is sparse: {se.is_sparse('counts')}\")\n",
    "print(f\"counts dtype: {se.get_assay_type('counts')}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Cleanup"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2026-04-01T15:25:00.269471Z",
     "iopub.status.busy": "2026-04-01T15:25:00.269327Z",
     "iopub.status.idle": "2026-04-01T15:25:00.281112Z",
     "shell.execute_reply": "2026-04-01T15:25:00.278573Z"
    }
   },
   "outputs": [],
   "source": [
    "shutil.rmtree(DATA_DIR)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.15"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}