Rxrx1 research running scripts #309

Open · wants to merge 44 commits into base: main (changes shown from 15 commits)

Commits (44)
668e286
Add rxrx1 dataset and fedavg
sanaAyrml Jan 6, 2025
56193f8
Delete extra file
sanaAyrml Jan 6, 2025
c806b91
update running scripts
sanaAyrml Jan 6, 2025
33f8bd9
Update running scripts
sanaAyrml Jan 6, 2025
fccced0
Update client
sanaAyrml Jan 6, 2025
b43f982
update rxrx1 dataset
sanaAyrml Jan 6, 2025
6de30e1
Delete dataset extra print
sanaAyrml Jan 7, 2025
a7c3ff8
Add CUBLAS_WORKSPACE_CONFIG for experiment
sanaAyrml Jan 7, 2025
75eb831
Update number of clients
sanaAyrml Jan 7, 2025
3d03729
Add evaluation files
sanaAyrml Jan 7, 2025
d306951
Add rxrx1 ditto experiments
sanaAyrml Jan 8, 2025
f80ca11
[pre-commit.ci] Add auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jan 8, 2025
a2da4b1
Merge branch 'main' into sa_rxrx1_research
sanaAyrml Jan 9, 2025
3b56645
Update typing with new changes in main
sanaAyrml Jan 9, 2025
e0c6d49
Add caching option to dataloader
sanaAyrml Jan 9, 2025
2145be8
Update evaluation script
sanaAyrml Jan 9, 2025
624a529
Update evaluation script
sanaAyrml Jan 9, 2025
ba746ea
Ignoring a vulnerability without a fix yet
emersodb Jan 9, 2025
9cab150
address david's comments
sanaAyrml Jan 10, 2025
a2fd6ae
Add loading and unloading for dataset cache
sanaAyrml Jan 13, 2025
377372d
Increase memmory for experiments
sanaAyrml Jan 13, 2025
b322165
Add centeral training
sanaAyrml Jan 13, 2025
b0810aa
Update run script for centeral
sanaAyrml Jan 13, 2025
cce4ec2
Add time for centeral script
sanaAyrml Jan 13, 2025
ea0ee35
add preprocess file to rxrx1 dataset
sanaAyrml Jan 13, 2025
ba5d824
Add dataset download and preprocessing scripts
sanaAyrml Jan 13, 2025
9e8e570
Update dataset to Tensor dataset
sanaAyrml Jan 14, 2025
7e7434a
Update docstrings
sanaAyrml Jan 14, 2025
4347b20
[pre-commit.ci] Add auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jan 14, 2025
edd5c29
Merge branch 'main' into sa_rxrx1_research
sanaAyrml Jan 14, 2025
acaf941
Update load_data
sanaAyrml Jan 15, 2025
1c79841
make tensor save one by one
sanaAyrml Jan 15, 2025
31edd2a
[pre-commit.ci] Add auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jan 15, 2025
3f2ee19
Update preprocessing
sanaAyrml Jan 15, 2025
9d76d17
Merge branch 'sa_rxrx1_research' of https://github.com/VectorInstitut…
sanaAyrml Jan 15, 2025
f611765
Update image loading error
sanaAyrml Jan 15, 2025
186a930
Change targets type
sanaAyrml Jan 15, 2025
5f661aa
A few small fixes to the download script and additona to gitignore
emersodb Jan 16, 2025
ba8434f
reverting the directory path to previous one on the cluster
emersodb Jan 16, 2025
05afbed
Small vulnerability ignore
emersodb Jan 16, 2025
46daf5c
Merge pull request #315 from VectorInstitute/dbe/rxrx1_debugging
sanaAyrml Jan 16, 2025
2fd74ed
Address new set of David's comments
sanaAyrml Jan 16, 2025
a2d8f40
Merge branch 'main' into sa_rxrx1_research
sanaAyrml Jan 21, 2025
de54458
locking poetry files
sanaAyrml Jan 21, 2025
28 changes: 28 additions & 0 deletions fl4health/datasets/rxrx1/README.md
@@ -0,0 +1,28 @@
# Fluorescent Microscopy Images Dataset Download and Preprocessing

This directory provides a set of scripts to download and preprocess the RxRx1 dataset for use in federated learning experiments. The dataset includes 6-channel fluorescent microscopy images of cells treated with different compounds. It is provided by Recursion Pharmaceuticals and is available on the [RxRx1 page](https://www.rxrx.ai/rxrx1).

## Getting Started

To start using the dataset, follow the steps below.


### Downloading the Dataset
To obtain the dataset, run the provided shell script to download and unzip the required files.

```sh
sh fl4health/datasets/rxrx1/download.sh
```


### Preprocessing the Dataset

Once the dataset is downloaded, run the preprocessing script to generate the per-client metadata files and to prepare the training and testing tensors for each client participating in the federated learning experiments:

```sh
python fl4health/datasets/rxrx1/preprocess.py <path_to_rxrx1_data>
```

### Using the Dataset

After preprocessing, the dataset is ready to be used in federated learning settings. For examples, please refer to the [RxRx1 experiments](research/rxrx1) directory.
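
As a minimal sketch of how the per-client dataloaders can be constructed from the preprocessed data (the path and parameter values below are placeholders, not part of the scripts above):

```python
from pathlib import Path

from fl4health.datasets.rxrx1.load_data import load_rxrx1_data, load_rxrx1_test_data

# Placeholder path: the directory passed to preprocess.py
data_path = Path("<path_to_rxrx1_data>")

# Train/validation loaders for the first client
# (client_num is zero-indexed in code; files on disk are numbered from 1)
train_loader, val_loader, num_examples = load_rxrx1_data(
    data_path=data_path, client_num=0, batch_size=32, seed=42
)

# Held-out test loader for the same client
test_loader, test_examples = load_rxrx1_test_data(data_path=data_path, client_num=0, batch_size=32)
```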
43 changes: 43 additions & 0 deletions fl4health/datasets/rxrx1/download.sh
@@ -0,0 +1,43 @@
echo "RxRx1 dataset download."
# Define the URL and the target directory and file name
URL="https://storage.googleapis.com/rxrx/rxrx1"
METADATA_URL="https://storage.googleapis.com/rxrx/rxrx1/rxrx1-metadata.zip"
DIRECTORY="/projects/fl4health/datasets/rxrx1_v1.0"
IMAGE_FILE="rxrx1-images.zip"
METADATA_FILE="rxrx1-metadata.zip"

# Create the directory if it doesn't exist
mkdir -p "$DIRECTORY"

# Check if the image archive already exists in the target directory
if [ -f "$DIRECTORY/$IMAGE_FILE" ]; then
    echo "File $DIRECTORY/$IMAGE_FILE already exists. No download needed."
else
    echo "Downloading $IMAGE_FILE"
    wget -O "$DIRECTORY/$IMAGE_FILE" "$URL/$IMAGE_FILE"
    if [ $? -eq 0 ]; then
        echo "Download completed successfully."
    else
        echo "Download failed."
    fi
fi

mkdir -p "${DIRECTORY}/images"
unzip "${DIRECTORY}/${IMAGE_FILE}" -d "${DIRECTORY}/images/"

# Check if the metadata archive already exists in the target directory
if [ -f "$DIRECTORY/$METADATA_FILE" ]; then
    echo "File $DIRECTORY/$METADATA_FILE already exists. No download needed."
else
    echo "Downloading $METADATA_FILE"
    wget -O "$DIRECTORY/$METADATA_FILE" "$METADATA_URL"
    if [ $? -eq 0 ]; then
        echo "Download completed successfully."
    else
        echo "Download failed."
    fi
fi

unzip "${DIRECTORY}/${METADATA_FILE}" -d "${DIRECTORY}"

echo "Download completed."
171 changes: 171 additions & 0 deletions fl4health/datasets/rxrx1/load_data.py
@@ -0,0 +1,171 @@
import copy
import os
import pickle
from collections import defaultdict
from collections.abc import Callable
from logging import INFO
from pathlib import Path

import numpy as np
import pandas as pd
import torch
from flwr.common.logger import log
from torch.utils.data import DataLoader, Subset

from fl4health.utils.dataset import TensorDataset


def construct_rxrx1_tensor_dataset(
    metadata: pd.DataFrame,
    data_path: Path,
    client_num: int,
    dataset_type: str,
    transform: Callable | None = None,
) -> tuple[TensorDataset, dict[int, int]]:
    """
    Construct a TensorDataset for rxrx1 data.

    Args:
        metadata (pd.DataFrame): A DataFrame containing image metadata.
        data_path (Path): Root directory from which the image data should be loaded.
        client_num (int): Client number to load data for.
        dataset_type (str): 'train' or 'test' to specify dataset type.
        transform (Callable | None): Transformation function to apply to the images. Defaults to None.

    Returns:
        tuple[TensorDataset, dict[int, int]]: A TensorDataset containing the processed images and a mapping from
        the remapped labels back to the original sirna_id labels.
    """
    # Remap the sirna_id labels to contiguous class indices and keep the inverse mapping
    label_map = {label: idx for idx, label in enumerate(sorted(metadata["sirna_id"].unique()))}
    original_label_map = {new_label: original_label for original_label, new_label in label_map.items()}
    metadata = metadata[metadata["dataset"] == dataset_type]
    targets_tensor = torch.Tensor(list(metadata["sirna_id"].map(label_map)))
    data_list = []
    for index in range(len(targets_tensor)):
        with open(
            os.path.join(data_path, f"clients/{dataset_type}_data_{client_num+1}/image_{index}.pkl"), "rb"
        ) as file:
            data_list.append(torch.Tensor(pickle.load(file)))
    data_tensor = torch.cat(data_list)

    return TensorDataset(data_tensor, targets_tensor, transform), original_label_map


def label_frequency(dataset: TensorDataset | Subset, original_label_map: dict[int, int]) -> None:
    """
    Log the frequency of each label in the dataset.

    Args:
        dataset (TensorDataset | Subset): The dataset to analyze.
        original_label_map (dict[int, int]): A mapping of the remapped labels back to their original labels.
    """
    # Extract the targets from the dataset (or from the underlying dataset of a Subset)
    if isinstance(dataset, TensorDataset):
        targets = dataset.targets
    elif isinstance(dataset, Subset):
        assert isinstance(dataset.dataset, TensorDataset), "Subset dataset must be a TensorDataset instance."
        targets = dataset.dataset.targets
    else:
        raise TypeError("Dataset must be of type TensorDataset or Subset containing a TensorDataset.")

    # Group sample indices by label
    label_to_indices = defaultdict(list)
    assert isinstance(targets, torch.Tensor)
    for idx, label in enumerate(targets):
        label_to_indices[int(label.item())].append(idx)

    # Log the frequency of each label along with its original name
    for label, indices in label_to_indices.items():
        original_label = original_label_map.get(label)
        log(INFO, f"Label {label} (original: {original_label}): {len(indices)} samples")


def create_splits(
    dataset: TensorDataset, seed: int | None = None, train_fraction: float = 0.8
) -> tuple[list[int], list[int]]:
    """
    Split the dataset into training and validation index sets, stratified by label.

    Args:
        dataset (TensorDataset): The dataset to split.
        seed (int | None): Seed used when shuffling indices within each label. Defaults to None.
        train_fraction (float): Fraction of data to use for training. Defaults to 0.8.

    Returns:
        tuple[list[int], list[int]]: (train_indices, val_indices)
    """
    # Group indices by label
    label_to_indices = defaultdict(list)
    assert isinstance(dataset.targets, torch.Tensor)
    for idx, label in enumerate(dataset.targets):
        label_to_indices[label.item()].append(idx)

    # Stratified splitting: shuffle the indices of each label and split them by train_fraction
    train_indices, val_indices = [], []
    for label, indices in label_to_indices.items():
        if seed is not None:
            np_generator = np.random.default_rng(seed)
            np_generator.shuffle(indices)
        else:
            np.random.shuffle(indices)
        split_point = int(len(indices) * train_fraction)
        train_indices.extend(indices[:split_point])
        val_indices.extend(indices[split_point:])
    if len(val_indices) == 0:
        log(INFO, "Warning: Validation set is empty. Consider changing the train_fraction parameter.")

    return train_indices, val_indices


def load_rxrx1_data(
    data_path: Path,
    client_num: int,
    batch_size: int,
    seed: int | None = None,
    train_val_split: float = 0.8,
    num_workers: int = 0,
) -> tuple[DataLoader, DataLoader, dict[str, int]]:
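    """
    Construct train and validation dataloaders for a single rxrx1 client.

    Args:
        data_path (Path): Root directory from which the preprocessed image data should be loaded.
        client_num (int): Client number to load data for.
        batch_size (int): Batch size for the train and validation dataloaders.
        seed (int | None): Seed used when shuffling indices for the train/validation split. Defaults to None.
        train_val_split (float): Fraction of the data used for training. Defaults to 0.8.
        num_workers (int): Number of workers for the train dataloader. Defaults to 0.

    Returns:
        tuple[DataLoader, DataLoader, dict[str, int]]: Train dataloader, validation dataloader, and a dictionary
        with the number of examples in each set.
    """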

    # Read the CSV file
    data = pd.read_csv(f"{data_path}/clients/meta_data_{client_num+1}.csv")

    dataset, _ = construct_rxrx1_tensor_dataset(data, data_path, client_num, "train")

    train_indices, val_indices = create_splits(dataset, seed=seed, train_fraction=train_val_split)
    train_set = copy.deepcopy(dataset)
    train_set.data = train_set.data[train_indices]
    assert train_set.targets is not None
    train_set.targets = train_set.targets[train_indices]

    validation_set = copy.deepcopy(dataset)
    validation_set.data = validation_set.data[val_indices]
    assert validation_set.targets is not None
    validation_set.targets = validation_set.targets[val_indices]

    train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True, num_workers=num_workers, pin_memory=True)
    validation_loader = DataLoader(validation_set, batch_size=batch_size)
    num_examples = {
        "train_set": len(train_set.data),
        "validation_set": len(validation_set.data),
    }

    return train_loader, validation_loader, num_examples


def load_rxrx1_test_data(
    data_path: Path, client_num: int, batch_size: int, num_workers: int = 0
) -> tuple[DataLoader, dict[str, int]]:
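    """
    Construct an evaluation dataloader for the test split of a single rxrx1 client.

    Args:
        data_path (Path): Root directory from which the preprocessed image data should be loaded.
        client_num (int): Client number to load data for.
        batch_size (int): Batch size for the evaluation dataloader.
        num_workers (int): Number of workers for the evaluation dataloader. Defaults to 0.

    Returns:
        tuple[DataLoader, dict[str, int]]: Evaluation dataloader and a dictionary with the number of examples
        in the evaluation set.
    """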

    # Read the CSV file
    data = pd.read_csv(f"{data_path}/clients/meta_data_{client_num+1}.csv")

    dataset, _ = construct_rxrx1_tensor_dataset(data, data_path, client_num, "test")

    evaluation_loader = DataLoader(
        dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers, pin_memory=True
    )
    num_examples = {"eval_set": len(dataset.data)}
    return evaluation_loader, num_examples
129 changes: 129 additions & 0 deletions fl4health/datasets/rxrx1/preprocess.py
@@ -0,0 +1,129 @@
import argparse
import os
import pickle
from logging import INFO
from pathlib import Path
from typing import Any

import pandas as pd
import torch
from flwr.common.logger import log
from PIL import Image
from torchvision.transforms import ToTensor


def filter_and_save_data(metadata: pd.DataFrame, top_sirna_ids: list[int], cell_type: str, output_path: Path) -> None:
    """
    Filter the metadata to rows with the given cell type whose sirna_id is among the most frequent ones and save
    the result to a CSV file.

    Args:
        metadata (pd.DataFrame): Metadata containing information about all images.
        top_sirna_ids (list[int]): Top sirna_id values to filter by.
        cell_type (str): Cell type to filter by.
        output_path (Path): Path to save the filtered metadata.
    """
    filtered_metadata = metadata[(metadata["sirna_id"].isin(top_sirna_ids)) & (metadata["cell_type"] == cell_type)]
    filtered_metadata.to_csv(output_path, index=False)


def load_image(row: dict[str, Any], root: Path) -> torch.Tensor:
    """
    Load an image tensor for a given row of metadata.

    Args:
        row (dict[str, Any]): A row of metadata containing experiment, plate, well, and site information.
        root (Path): Root directory containing the image files.

    Returns:
        torch.Tensor: The loaded image tensor.
    """
    experiment = row["experiment"]
    plate = row["plate"]
    well = row["well"]
    site = row["site"]

    images = []
    for channel in range(1, 4):
        image_path = os.path.join(root, f"images/{experiment}/Plate{plate}/{well}_s{site}_w{channel}.png")
        if not Path(image_path).exists():
            raise FileNotFoundError(f"Image not found at {image_path}")
        image = ToTensor()(Image.open(image_path).convert("L"))
        images.append(image)

    # Concatenate the three channels into one tensor
    return torch.cat(images, dim=0)


def process_data(metadata: pd.DataFrame, root: Path) -> torch.Tensor:
    """
    Process the entire dataset, loading image tensors for each row.

    Args:
        metadata (pd.DataFrame): Metadata containing information about all images.
        root (Path): Root directory containing the image files.

    Returns:
        torch.Tensor: A single tensor containing all processed images.
    """
    all_tensors = []
    for _, row in metadata.iterrows():
        image_tensor = load_image(row.to_dict(), Path(root))
        all_tensors.append(image_tensor)

    # Stack all tensors into a single tensor
    return torch.stack(all_tensors)


def save_to_pkl(data: torch.Tensor, output_path: str) -> None:
    """
    Save data to a pickle file.

    Args:
        data (torch.Tensor): Data to save.
        output_path (str): Path to the output pickle file.
    """
    with open(output_path, "wb") as f:
        pickle.dump(data, f)
    log(INFO, f"Data saved to {output_path}")


def main(dataset_dir: Path) -> None:
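    """
    Filter the RxRx1 metadata to the 50 most frequent sirna_id values, split it by cell type into one metadata file
    per client, and save the processed train and test image tensors for each client under the clients directory.

    Args:
        dataset_dir (Path): Path to the dataset directory containing metadata.csv and the extracted images.
    """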
    metadata_file = os.path.join(dataset_dir, "metadata.csv")
    output_dir = os.path.join(dataset_dir, "clients")

    os.makedirs(output_dir, exist_ok=True)

    data = pd.read_csv(metadata_file)

    # Get the top 50 `sirna_id`s by frequency
    top_sirna_ids = data["sirna_id"].value_counts().head(50).index.tolist()

    # Define the cell types used to distribute the data across clients
    cell_types = ["RPE", "HUVEC", "HEPG2", "U2OS"]
    output_files = [os.path.join(output_dir, f"meta_data_{i+1}.csv") for i in range(len(cell_types))]

    # Filter and save data for each client
    for cell_type, output_path in zip(cell_types, output_files):
        filter_and_save_data(data, top_sirna_ids, cell_type, Path(output_path))

    for i, metadata_path in enumerate(output_files):
        metadata = pd.read_csv(metadata_path)

        # Split the metadata into train and test datasets
        train_metadata = metadata[metadata["dataset"] == "train"]
        test_metadata = metadata[metadata["dataset"] == "test"]

        train_tensor = process_data(train_metadata, dataset_dir)
        save_to_pkl(train_tensor, os.path.join(output_dir, f"train_data_{i+1}.pkl"))

        test_tensor = process_data(test_metadata, dataset_dir)
        save_to_pkl(test_tensor, os.path.join(output_dir, f"test_data_{i+1}.pkl"))


if __name__ == "__main__":
    # Argument parsing
    parser = argparse.ArgumentParser(description="Filter dataset by the most frequent sirna_id and cell_type.")
    parser.add_argument("dataset_dir", type=str, help="Path to the dataset directory containing metadata.csv")

    args = parser.parse_args()
    main(args.dataset_dir)