From 3e2d3ad0cd0270468f92412965a5fc1495ff4dcf Mon Sep 17 00:00:00 2001 From: "Brian M. Schilder" <34280215+bschilder@users.noreply.github.com> Date: Wed, 20 Mar 2024 18:51:28 +0000 Subject: [PATCH] update csl report --- reports/CSL.Rmd | 151 +++++++++-- reports/CSL.html | 640 ++++++++++++++++++++++++++++++----------------- 2 files changed, 534 insertions(+), 257 deletions(-) diff --git a/reports/CSL.Rmd b/reports/CSL.Rmd index d195c24..d86e5eb 100644 --- a/reports/CSL.Rmd +++ b/reports/CSL.Rmd @@ -3,7 +3,12 @@ title: "Rare Disease Celltyping" subtitle: "CSL Areas of Interest" author: "Brian M. Schilder" date: "
Here, we wanted to map CSL areas +of interest onto the Human Phenotype Ontology (HPO) and/or other disease +ontologies (OMIM/ORPH). This allowed us to connect our phenotype-cell +type association results to the CSL areas of interest. For a description +of our methodology, see here:
++Identification of cell type-specific gene targets underlying +thousands of rare diseases and subtraits Kitty B. Murphy, Robert +Gordon-Smith, Jai Chapman, Momoko Otani, Brian M. Schilder, Nathan G. +Skene https://www.medrxiv.org/content/10.1101/2023.02.13.23285820v1
+
CSL areas of interest mapped onto Human Phenotype Ontology (HPO) -and/or other disease ontologies (OMIM/ORPH).
+First, we:
+Gathered all diseases/phenotypes that CSL lists on their website +and/or the Research Accelerator Initiative materials.
For each disease/phenotype, we assigned a Group based on whether +CSL has expressed interest in Early Stage Partnering with external labs +(e.g. via the RAI) or whether they are already actively pursuing +research in these fields.
We then grouped each disease/phenotype into a broader Area +(“Immunology”) and a particular disease Subarea +(“Dermatomyositis”).
Next, we mapped each disease/phenotype onto a standardised HPO ID +or OMIM/Orphanet ID.
csl <- data.table::fread(here::here("data/CSL_areas_of_interest.tsv"))
-csl <- csl[Group=="Early Stage Partnering"]
+# csl <- csl[Group=="Early Stage Partnering"]
# extract IDs from the Entry column of csl data.table with the pattern "HP:", "OMIM:", "ORPHA:" and put into a new col
csl[, ID := stringr::str_extract(Entry, "HP:\\d+|OMIM:\\d+|ORPHA:\\d+")]
# extract names from Entry column WITHOUT the ID
csl[, Name := trimws(stringr::str_remove(Entry, "HP:\\d+|OMIM:\\d+|ORPHA:\\d+"))]
MSTExplorer::create_dt(csl)
-
-
-Extract IDs of phenotypes and diseases. Get any descendants of -manually mapped HPO IDs as well.
+ + +Next, I took all the HPO IDs and expanded this list to any HPO terms +that are descendants of the original HPO IDs. This allows us to capture +any phenotypes that are more specific than the original CSL HPO IDs.
hpo_ids <- csl[grepl("HP:",ID)]$ID |> unique()
disease_ids <- csl[!grepl("HP:",ID)]$ID |> unique()
hpo <- KGExplorer::get_ontology("hp")
@@ -76780,6 +76892,9 @@ Import CSL areas of interest
Next, I imported the results of our phenotype-cell type association
+analysis. This analysis was performed using the MSTExplorer
+package and the results are stored in a data.table.
results <- MSTExplorer::load_example_results()
results <- HPOExplorer::add_hpo_name(results, hpo = hpo)
results <- HPOExplorer::add_ont_lvl(results)
@@ -76793,6 +76908,13 @@ Import phenotype-cell type association results
We developed a pipeline to help filter down our results to identif +the promise promising gene targets for the most severe phenotypes. It +includes a number of different steps, including filtering on cell type +specific gene expression and the evidence supporting the causal +relationships between each gene and a phenotype.
+Ultimately, it allows us to trace multi-scale disease mechanisms from +genes –> cell types –> phenotypes –> diseases
prioritise_targets_out <- MSTExplorer::prioritise_targets(
results = results,
hpo = hpo,
@@ -76819,7 +76941,21 @@ Prioritise targets
MSTExplorer::create_dt(head(targets,100))
-Summarise results:
+Here we summarise the percentage of CSL phenotypes of interest (with +HPO IDs) included in our prioritised cell type-specific targets. We also +compute the proportion of CSL diseases (with OMIM/Orphanet IDs) that +have at least one of those phenotypes as a symptom.
+message(paste0(
+ length(intersect(results$hpo_id,hpo_ids_extended)),"/",
+ length(unique(hpo_ids_extended)),
+ " (",round(length(intersect(hpo_ids_extended,results$hpo_id))/
+ length(unique(hpo_ids_extended))*100,2),"%)",
+ " CSL phenotypes",
+ " covered in our phenotype-cell type association results."
+))
+## 1166/1860 (62.69%) CSL phenotypes covered in our phenotype-cell type association results.
message(paste0(
length(intersect(all_targets$hpo_id,hpo_ids_extended)),"/",
length(unique(hpo_ids_extended)),
@@ -76829,15 +76965,16 @@ Prioritise targets
" covered in our prioritised targets."
))
## 594/1860 (31.94%) CSL phenotypes (across 4850 diseases) covered in our prioritised targets.
+top_targets_ancestor <- targets[!duplicated(ancestor_name),]
-csl_areas <- unique(csl[,c("ID","Area","Subarea")])
+From our list of prioritised cell type-specific gene targets, we can
+now select the top targets for each CSL area of interest.
+We have arbitrarily set a cutoff of 3 targets per area, but this can
+easily be adjusted as needed.
+top_n <- 3
+# top_targets_ancestor <- targets[!duplicated(ancestor_name),]
+csl_areas <- unique(csl[,c("ID","Area","Subarea")])
targets2 <- targets |>
merge(csl_areas,
all.x=TRUE,
@@ -76849,7 +76986,7 @@ Per CSL area of interest
targets2[,Subarea:=data.table::fcoalesce(Subarea.x,Subarea.y)]
#### Select top N targets per CSL Area ####
num_cols <- sapply(targets2,is.numeric)
-top_targets <- targets2[!is.na(Area), head(.SD,3),
+top_targets <- targets2[!is.na(Area), head(.SD,top_n),
by=c("Area")]
# drop list columns
# save_path <- here::here("reports/top_targets_CSL.tsv")
@@ -76858,8 +76995,10 @@ Per CSL area of interest
# top_targets <- data.table::fread(here::here("top_targets_CSL.tsv"))
MSTExplorer::create_dt(top_targets)
-
-
+
+
+We can also count the number of targets, genes, phenotypes and
+diseases per area of interest.
target_counts <- targets2[,list(n_targets=.N,
n_genes=data.table::uniqueN(gene_symbol),
n_phenotypes=data.table::uniqueN(hpo_id),
@@ -76868,27 +77007,51 @@ Per CSL area of interest
data.table::setorderv("n_targets",-1)
MSTExplorer::create_dt(target_counts)
-
-
-
Now that we have the top candidate targets for CSL disease areas, we +can explore them in more detail. For example, we can use additional +resources (ClinVar, Variant Effect Predictor, gnomad) to find out which +variants are deleterious within (or around) the gene targets. We can +also gathered population-level data on the frequency of these variants, +giving us a better idea of the number of patients that may benefit from +treating this particular mechanism.
Check for other results with the same cell type and gene target.
+Let’s first explore phenotypes related to “Stroke”.
+As an example, we’re going to look specifically at the role of +COL4A1 in stroke.
stroke_targets <- targets2[grepl("stroke",hpo_name, ignore.case = TRUE)]
stroke_targets <- stroke_targets[gene_symbol=="COL4A1"]
+https://gnomad.broadinstitute.org/gene/ENSG00000187498?dataset=gnomad_r4
-COL4A1 exonn positions: https://www.ensembl.org/Homo_sapiens/Transcript/Exons?db=core;g=ENSG00000187498;r=13:110213499-110214499;t=ENST00000375820;v=rs34004222;vdb=variation;vf=813354076
-Exon 3: chr13:110213926-110214015
+We gathered additional data from the gnomad website on the +COL4A1 gene (i.e. ENSG00000187498).
+Our goal here was to:
+Identify confirmed pathogenic variants in +COL4A1.
Check the frequency of these variants in the general +population.
Identify a particular exon that has the highest frequency of +pathogenic variants. The idea being that this exon could be a good +target for therapeutic intervention via ASO-induced +exon-skipping.
Additional info:
+Exon 3 spans the following genomic coordinates: +chr13:110213926-110214015
#### Import ####
gnomad <- data.table::fread(here::here("data/gnomAD_v4.0.0_ENSG00000187498_2024_02_08_11_13_20.csv"),
check.names = TRUE)
@@ -76927,8 +77090,20 @@ gnomad
"exon_pathovariants_perbp",
"exon_pathovariants_cumfreq")] |>
MSTExplorer::create_dt()
-
-
+
+
+We found two exons with the highest cumulative frequency of +pathogenic variants in the COL4A1 gene. Of these, exon 3 has +the highest cumulative frequency of pathogenic variants.
+Follow up research into the existing literature was carried out (not +shown here), to confirm whether:
+any similar therapies currently existing for this gene
exon 3 could be safely skipped
there are any animal models available for the homologous exons in +this gene