single-cell-data · bkmartinjr · Oct 15, 2024 · Sep 23, 2024 · Oct 15, 2024 · Oct 15, 2024
diff --git a/.github/workflows/python-tiledbsoma-ml.yml b/.github/workflows/python-tiledbsoma-ml.yml
@@ -3,10 +3,14 @@ name: python-tiledbsoma-ml CI
 on:
   pull_request:
     branches: ["**"]
-    paths-ignore: ['scripts/**']
+    paths-ignore:
+      - "scripts/**"
+      - "notebooks/**"
   push:
     branches: [main]
-    paths-ignore: ['scripts/**']
+    paths-ignore:
+      - "scripts/**"
+      - "notebooks/**"
   workflow_dispatch:
 
 jobs:

diff --git a/notebooks/tutorial_lightning.ipynb b/notebooks/tutorial_lightning.ipynb
@@ -0,0 +1,233 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Training a model with PyTorch Lightning\n",
+    "\n",
+    "This tutorial demonstrates training a toy model with [PyTorch Lightning], using the `tiledbsoma_ml.ExperimentAxisQueryIterableDataset` class, on data from the [CZI CELLxGENE Census](https://chanzuckerberg.github.io/cellxgene-census/). It is intended for demonstration purposes only, not as an example of how to train a biologically useful model.\n",
+    "\n",
+    "For more information on these APIs, please refer to the [`tutorial_pytorch` notebook](tutorial_pytorch.ipynb).\n",
+    "\n",
+    "**Prerequisites**\n",
+    "\n",
+    "Install [`tiledbsoma_ml`], [`scikit-learn`], and [`pytorch-lightning`]:\n",
+    "\n",
+    "```bash\n",
+    "pip install tiledbsoma_ml scikit-learn pytorch-lightning\n",
+    "```\n",
+    "\n",
+    "[PyTorch Lightning]: https://lightning.ai/docs/pytorch/stable/\n",
+    "[`tiledbsoma_ml`]: https://github.com/single-cell-data/TileDB-SOMA-ML/\n",
+    "[`scikit-learn`]: https://pypi.org/project/scikit-learn/\n",
+    "[`pytorch-lightning`]: https://pypi.org/project/pytorch-lightning/"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Initialize SOMA Experiment query as training data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pytorch_lightning as pl\n",
+    "import tiledbsoma as soma\n",
+    "import torch\n",
+    "from sklearn.preprocessing import LabelEncoder\n",
+    "\n",
+    "import tiledbsoma_ml as soma_ml\n",
+    "\n",
+    "# Human data from the 2024-07-01 Census release, read from the public S3 bucket\n",
+    "CZI_Census_Homo_Sapiens_URL = \"s3://cellxgene-census-public-us-west-2/cell-census/2024-07-01/soma/census_data/homo_sapiens/\"\n",
+    "census_context = soma.SOMATileDBContext(tiledb_config={\"vfs.s3.region\": \"us-west-2\"})\n",
+    "experiment = soma.open(CZI_Census_Homo_Sapiens_URL, context=census_context)\n",
+    "\n",
+    "# Select only primary-data observations from tongue tissue\n",
+    "obs_value_filter = \"tissue_general == 'tongue' and is_primary_data == True\"\n",
+    "obs_query = soma.AxisQuery(value_filter=obs_value_filter)\n",
+    "with experiment.axis_query(measurement_name=\"RNA\", obs_query=obs_query) as query:\n",
+    "    # Fit the label encoder on the distinct cell_type values matched by the query\n",
+    "    obs_df = query.obs(column_names=[\"cell_type\"]).concat().to_pandas()\n",
+    "    cell_type_encoder = LabelEncoder().fit(obs_df[\"cell_type\"].unique())\n",
+    "\n",
+    "    # Shuffled batches of size 128 from the \"raw\" X layer, with the cell_type column\n",
+    "    experiment_dataset = soma_ml.ExperimentAxisQueryIterableDataset(\n",
+    "        query,\n",
+    "        X_name=\"raw\",\n",
+    "        obs_column_names=[\"cell_type\"],\n",
+    "        batch_size=128,\n",
+    "        shuffle=True,\n",
+    "    )"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Define the Lightning module"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "class LogisticRegressionLightning(pl.LightningModule):\n",
+    "    \"\"\"Logistic regression: one linear layer mapping `input_dim` features to `output_dim` class logits.\"\"\"\n",
+    "\n",
+    "    def __init__(self, input_dim, output_dim, cell_type_encoder, learning_rate=1e-5):\n",
+    "        super().__init__()\n",
+    "        self.linear = torch.nn.Linear(input_dim, output_dim)\n",
+    "        self.cell_type_encoder = cell_type_encoder\n",
+    "        self.learning_rate = learning_rate\n",
+    "        # CrossEntropyLoss applies log-softmax internally, so it expects raw logits\n",
+    "        self.loss_fn = torch.nn.CrossEntropyLoss()\n",
+    "\n",
+    "    def forward(self, x):\n",
+    "        \"\"\"Return the raw class scores (logits) for a batch; no activation is applied.\"\"\"\n",
+    "        return self.linear(x)\n",
+    "\n",
+    "    def training_step(self, batch, batch_idx):\n",
+    "        \"\"\"Return the cross-entropy loss for one `(X, obs)` batch, logging loss and accuracy.\"\"\"\n",
+    "        X_batch, y_batch = batch\n",
+    "        X_batch = torch.from_numpy(X_batch).float().to(self.device)\n",
+    "        y_batch = torch.from_numpy(\n",
+    "            self.cell_type_encoder.transform(y_batch[\"cell_type\"])\n",
+    "        ).to(self.device)\n",
+    "\n",
+    "        # Compute loss directly from the logits (no sigmoid/softmax beforehand)\n",
+    "        logits = self(X_batch)\n",
+    "        loss = self.loss_fn(logits, y_batch.long())\n",
+    "\n",
+    "        # Determine the predicted label; softmax is monotonic, so argmax of the logits suffices\n",
+    "        predictions = torch.argmax(logits, dim=1)\n",
+    "\n",
+    "        # Compute accuracy\n",
+    "        train_correct = (predictions == y_batch).sum().item()\n",
+    "        train_accuracy = train_correct / len(predictions)\n",
+    "\n",
+    "        # Log loss and accuracy\n",
+    "        self.log(\"train_loss\", loss, prog_bar=True)\n",
+    "        self.log(\"train_accuracy\", train_accuracy, prog_bar=True)\n",
+    "\n",
+    "        return loss\n",
+    "\n",
+    "    def configure_optimizers(self):\n",
+    "        \"\"\"Return an Adam optimizer over all parameters, using `self.learning_rate`.\"\"\"\n",
+    "        return torch.optim.Adam(self.parameters(), lr=self.learning_rate)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Train the model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "GPU available: True (cuda), used: True\n",
+      "TPU available: False, using: 0 TPU cores\n",
+      "HPU available: False, using: 0 HPUs\n",
+      "/home/bruce/miniforge3/envs/toymodel/lib/python3.11/site-packages/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py:75: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `pytorch_lightning` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default\n",
+      "LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]\n",
+      "\n",
+      "  | Name    | Type             | Params | Mode \n",
+      "-----------------------------------------------------\n",
+      "0 | linear  | Linear           | 726 K  | train\n",
+      "1 | loss_fn | CrossEntropyLoss | 0      | train\n",
+      "-----------------------------------------------------\n",
+      "726 K     Trainable params\n",
+      "0         Non-trainable params\n",
+      "726 K     Total params\n",
+      "2.905     Total estimated model params size (MB)\n",
+      "2         Modules in train mode\n",
+      "0         Modules in eval mode\n",
+      "/home/bruce/miniforge3/envs/toymodel/lib/python3.11/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=19` in the `DataLoader` to improve performance.\n",
+      "/home/bruce/miniforge3/envs/toymodel/lib/python3.11/site-packages/pytorch_lightning/utilities/data.py:122: Your `IterableDataset` has `__len__` defined. In combination with multi-process data loading (when num_workers > 1), `__len__` could be inaccurate if each worker is not configured independently to avoid having duplicate data.\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Epoch 19: 100%|██████████| 118/118 [00:08<00:00, 14.31it/s, v_num=5, train_loss=1.670, train_accuracy=0.977]"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "`Trainer.fit` stopped: `max_epochs=20` reached.\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Epoch 19: 100%|██████████| 118/118 [00:08<00:00, 14.28it/s, v_num=5, train_loss=1.670, train_accuracy=0.977]\n"
+     ]
+    }
+   ],
+   "source": [
+    "dataloader = soma_ml.experiment_dataloader(experiment_dataset)\n",
+    "\n",
+    "# Model dimensions: one input per gene, one output per distinct cell_type value\n",
+    "input_dim = experiment_dataset.shape[1]\n",
+    "output_dim = len(cell_type_encoder.classes_)\n",
+    "\n",
+    "# Initialize the PyTorch Lightning model\n",
+    "model = LogisticRegressionLightning(\n",
+    "    input_dim=input_dim,\n",
+    "    output_dim=output_dim,\n",
+    "    cell_type_encoder=cell_type_encoder,\n",
+    ")\n",
+    "\n",
+    "# Define the PyTorch Lightning Trainer; training stops after 20 epochs\n",
+    "trainer = pl.Trainer(max_epochs=20)\n",
+    "\n",
+    "# Set the float32 matrix-multiplication precision before training starts\n",
+    "torch.set_float32_matmul_precision(\"high\")\n",
+    "\n",
+    "# Train the model\n",
+    "trainer.fit(model, train_dataloaders=dataloader)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "toymodel",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.9"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}