fixed getting variants from GDC

monarch-initiative · Apr 11, 2024 · ac183ca · ac183ca
1 parent 8c7b2ca
commit ac183ca
Show file tree

Hide file tree

Showing 12 changed files with 53,696 additions and 85,304 deletions.
diff --git a/diagnosis_df.txt b/diagnosis_df.txt
diff --git a/diagnosis_rsub.txt b/diagnosis_rsub.txt
diff --git a/interpretations_list.txt b/interpretations_list.txt
diff --git a/notebooks/Biosample_test.ipynb b/notebooks/Biosample_test.ipynb
@@ -377,6 +377,35 @@
     "treatment = fetch_rows(table='treatment', match_any=['primary_diagnosis_site = *lung*' , 'primary_diagnosis_site = *pulmonary*'], add_columns=['subject_id'])\n",
     "treatment.head()"
    ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[]"
+      ]
+     },
+     "execution_count": 15,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "#from oncoexporter.cda import CdaTableImporter\n",
+    "from oncoexporter.cda._gdc import GdcMutationService\n",
+    "subject_id_list = ['TCGA-05-4244', 'TCGA-05-4245', 'TCGA-05-4249', 'TCGA-05-4250', 'TCGA-05-4382',\n",
+    "                   'TCGA-05-4384', 'TCGA-05-4389', 'TCGA-05-4390', 'TCGA-05-4395', 'TCGA-05-4396']\n",
+    "subject_id = 'AP-CT3G' #'4d_lung.100_HM10395' # 'TCGA-DU-6407' CPTAC.C3N-01409\n",
+    "gdc_timeout=int(100_000)\n",
+    "gdc_mutation_service = GdcMutationService(timeout=gdc_timeout)\n",
+    "\n",
+    "variant_interpretations = gdc_mutation_service.fetch_variants(subject_id)\n",
+    "variant_interpretations"
+   ]
   }
  ],
  "metadata": {

diff --git a/notebooks/GDC_mutations_test.ipynb b/notebooks/GDC_mutations_test.ipynb
@@ -0,0 +1,58 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 25,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'TCGA-05-4250'"
+      ]
+     },
+     "execution_count": 25,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    " '''       {\n",
+    "            \"op\": \"in\",\n",
+    "            \"content\": {\n",
+    "                \"field\": \"case.available_variation_data\",\n",
+    "                \"value\": [\n",
+    "                    \"ssm\"\n",
+    "                ]\n",
+    "            }\n",
+    "        },\n",
+    "'''\n",
+    "import re\n",
+    "string = 'TCGA.TCGA-05-4250'\n",
+    "string = 'TCGA-05-4250'\n",
+    "re.sub(\"^[^.]+\\.\", \"\", string) # ^(.*?)\\..*   ^[^.]*\n"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Phenopackets",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.8"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/rsub_diagnosis_df.txt b/rsub_diagnosis_df.txt
diff --git a/scripts/run_lung.py b/scripts/run_lung.py
@@ -15,14 +15,18 @@
 fetch_rows( table='subject', match_all=[ 'primary_disease_type = *duct*', 'sex = F*' ] )
 fetch_rows( table='researchsubject', match_all=[ 'primary_diagnosis_site = NULL' ] )
 '''
-table_importer: CdaTableImporter = configure_cda_table_importer(use_cache=True)
+######   Input parameters  ########
+table_importer: CdaTableImporter = configure_cda_table_importer(use_cache=False)
 
 #Tsite = Q('primary_diagnosis_site = "%lung%" OR primary_diagnosis_site = "%pulmonary%"')
 # b = {'x':42, 'y':None}
 # function(1, **b) # equal to function(1, x=42, y=None)
-Tsite = {'match_any': ['primary_diagnosis_site = *lung*' , 'primary_diagnosis_site = *pulmonary*']}
+Query = {'match_any': ['primary_diagnosis_site = *lung*' , 'primary_diagnosis_site = *pulmonary*'],
+         'data_source': 'GDC'}
 cohort_name = 'Lung'
-p = table_importer.get_ga4gh_phenopackets(Tsite, cohort_name=cohort_name)
+#################################### 
+
+p = table_importer.get_ga4gh_phenopackets(Query, cohort_name=cohort_name)
 
 result_dir = os.path.abspath(os.path.join('phenopackets', cohort_name))
 os.makedirs(result_dir, exist_ok=True)

diff --git a/specimen_df.txt b/specimen_df.txt
diff --git a/src/oncoexporter/cda/_gdc.orig.py b/src/oncoexporter/cda/_gdc.orig.py
@@ -0,0 +1,127 @@
+import json
+import logging
+import typing
+
+import phenopackets as pp
+import requests
+
+
+class GdcMutationService:
+    """
+    `GdcMutationService` queries Genomic Data Commons REST endpoint to fetch variants for a CDA subject.
+    """
+
+    def __init__(
+            self,
+            page_size=100,
+            page=1,
+            timeout=10,
+    ):
+        self._logger = logging.getLogger(__name__)
+        self._url = 'https://api.gdc.cancer.gov/ssms'
+        self._page_size = page_size
+        self._page = page
+        self._timeout = timeout
+        self._fields = ','.join((
+            # "mutation_type",
+            # "mutation_subtype",
+            "ncbi_build",
+            "chromosome",
+            "start_position",
+            # "end_position",
+            "reference_allele",
+            "tumor_allele",
+            # "genomic_dna_change",
+            # "end_position",
+            # "gene_aa_change",
+            "consequence.transcript.gene.gene_id",
+            "consequence.transcript.gene.symbol",
+            "consequence.transcript.transcript_id",
+            "consequence.transcript.annotation.hgvsc",
+        ))
+
+    def fetch_variants(self, subject_id: str) -> typing.Sequence[pp.VariantInterpretation]:
+        params = self._prepare_query_params(subject_id)
+
+        response = requests.get(self._url, params=params, timeout=self._timeout)
+
+        if response.status_code == 200:
+            data = response.json()
+            mutations = data.get("data", {}).get("hits", [])
+
+            mutation_details = []
+            for mutation in mutations:
+                vi = self._map_mutation_to_variant_interpretation(mutation)
+                mutation_details.append(vi)
+
+            return mutation_details
+        else:
+            raise ValueError(f'Failed to fetch data due to {response.status_code}: {response.reason}')
+
+    def _prepare_query_params(self, subject_id: str):
+        filters = {
+            "op": "in",
+            "content": {
+                "field": "cases.submitter_id",
+                "value": [subject_id]
+            }
+        }
+        return {
+            "fields": self._fields,
+            "filters": json.dumps(filters),
+            "format": "JSON",
+            "size": self._page_size,
+        }
+
+    def _map_mutation_to_variant_interpretation(self, mutation) -> pp.VariantInterpretation:
+        vcf_record = self._parse_vcf_record(mutation)
+
+        vd = pp.VariationDescriptor()
+        vd.id = mutation['id']
+        if vcf_record is not None:
+            vd.vcf_record.CopyFrom(vcf_record)
+
+        # TODO: set gene
+        # TODO: 't_depth', 't_ref_count', 't_alt_count', 'n_depth', 'n_ref_count', 'n_alt_count'
+        # TODO: mutation status
+
+        for csq in mutation['consequence']:
+            expression = GdcMutationService._map_consequence_to_expression(csq)
+            if expression is not None:
+                vd.expressions.append(expression)
+
+        vd.molecule_context = pp.MoleculeContext.genomic
+
+        vi = pp.VariantInterpretation()
+        vi.variation_descriptor.CopyFrom(vd)
+        return vi
+
+    def _parse_vcf_record(self, mutation) -> typing.Optional[pp.VcfRecord]:
+        if mutation['reference_allele'] == '-' or mutation['tumor_allele'] == '-':
+            self._logger.debug(
+                'Cannot create a VCF record due to missing bases in the reference_allele/tumor_allele alleles: %s',
+                mutation)
+            return None
+
+        vcf_record = pp.VcfRecord()
+
+        vcf_record.genome_assembly = mutation['ncbi_build']
+        vcf_record.chrom = mutation['chromosome']
+        vcf_record.id = mutation['id']
+        vcf_record.pos = mutation['start_position']
+        vcf_record.ref = mutation['reference_allele']
+        vcf_record.alt = mutation['tumor_allele']
+
+        return vcf_record
+
+    @staticmethod
+    def _map_consequence_to_expression(csq) -> typing.Optional[pp.Expression]:
+        tx = csq['transcript']
+
+        expression = pp.Expression()
+        expression.syntax = 'hgvs.c'
+        tx_id = tx['transcript_id']
+        ann = tx['annotation']['hgvsc']
+        expression.value = f'{tx_id}:{ann}'
+
+        return expression
diff --git a/src/oncoexporter/cda/_gdc.py b/src/oncoexporter/cda/_gdc.py
@@ -18,11 +18,13 @@ def __init__(
             timeout=10,
     ):
         self._logger = logging.getLogger(__name__)
-        self._url = 'https://api.gdc.cancer.gov/ssms'
+        self._variants_url = 'https://api.gdc.cancer.gov/ssms'
+        self._survival_url = 'https://api.gdc.cancer.gov/analysis/survival'
+        self._cases = 'https://api.gdc.cancer.gov/cases'
         self._page_size = page_size
         self._page = page
         self._timeout = timeout
-        self._fields = ','.join((
+        self._variant_fields = ','.join((
             # "mutation_type",
             # "mutation_subtype",
             "ncbi_build",
@@ -39,26 +41,20 @@ def __init__(
             "consequence.transcript.transcript_id",
             "consequence.transcript.annotation.hgvsc",
         ))
+        self._case_fields = ','.join((
+            "demographic.vital_status",
+        ))
 
-    def fetch_variants(self, subject_id: str) -> typing.Sequence[pp.VariantInterpretation]:
-        params = self._prepare_query_params(subject_id)
-
-        response = requests.get(self._url, params=params, timeout=self._timeout)
-
+    def _fetch_data_from_gdc(self, url: str, subject_id: str, fields: typing.List[str]=None) -> typing.Any:
+        params = self._prepare_query_params(subject_id, fields)
+        response = requests.get(url, params=params, timeout=self._timeout)
         if response.status_code == 200:
             data = response.json()
-            mutations = data.get("data", {}).get("hits", [])
-
-            mutation_details = []
-            for mutation in mutations:
-                vi = self._map_mutation_to_variant_interpretation(mutation)
-                mutation_details.append(vi)
-
-            return mutation_details
+            return data
         else:
-            raise ValueError(f'Failed to fetch data due to {response.status_code}: {response.reason}')
-
-    def _prepare_query_params(self, subject_id: str):
+            raise ValueError(f'Failed to fetch data from {url} due to {response.status_code}: {response.reason}')
+    
+    def _prepare_query_params(self, subject_id: str, fields: typing.List[str]=None) -> typing.Dict:
         filters = {
             "op": "in",
             "content": {
@@ -67,12 +63,53 @@ def _prepare_query_params(self, subject_id: str):
             }
         }
         return {
-            "fields": self._fields,
+            "fields": fields,
             "filters": json.dumps(filters),
             "format": "JSON",
             "size": self._page_size,
         }
 
+    def fetch_variants(self, subject_id: str) -> typing.Sequence[pp.VariantInterpretation]:
+        variants = self._fetch_data_from_gdc(self._variants_url, subject_id, self._variant_fields)
+
+        mutations = variants.get("data", {}).get("hits", [])
+
+        mutation_details = []
+        for mutation in mutations:
+            vi = self._map_mutation_to_variant_interpretation(mutation)
+            mutation_details.append(vi)
+
+        return mutation_details
+
+    def fetch_vital_status(self, subject_id: str) -> pp.VitalStatus:
+        survival_data = self._fetch_data_from_gdc(self._survival_url, subject_id)
+        vital_status_data = self._fetch_data_from_gdc(self._cases, subject_id, self._case_fields)
+
+        survival_time = None
+        vital_status = None
+
+        survival_results = survival_data.get("results", [])
+        if survival_results:
+            donors = survival_results[0].get("donors", [])
+            if donors:
+                survival_time = donors[0].get("time")
+
+        vital_status_hits = vital_status_data.get("data", {}).get("hits", [])
+        if vital_status_hits:
+            demographic = vital_status_hits[0].get("demographic", {})
+            vital_status = demographic.get("vital_status")
+
+        vital_status_obj = pp.VitalStatus()
+        vital_status_obj.survival_time_in_days = int(survival_time) if survival_time is not None else 0
+        if vital_status == "Dead":
+            vital_status_obj.status = pp.VitalStatus.Status.DECEASED
+        elif vital_status == "Alive":
+            vital_status_obj.status = pp.VitalStatus.Status.ALIVE
+        else:
+            vital_status_obj.status = pp.VitalStatus.Status.UNKNOWN_STATUS
+
+        return vital_status_obj
+
     def _map_mutation_to_variant_interpretation(self, mutation) -> pp.VariantInterpretation:
         vcf_record = self._parse_vcf_record(mutation)
 
@@ -124,4 +161,4 @@ def _map_consequence_to_expression(csq) -> typing.Optional[pp.Expression]:
         ann = tx['annotation']['hgvsc']
         expression.value = f'{tx_id}:{ann}'
 
-        return expression
+        return expression
diff --git a/src/oncoexporter/cda/cda_table_importer.py b/src/oncoexporter/cda/cda_table_importer.py
@@ -314,11 +314,21 @@ def get_ga4gh_phenopackets(self, source: dict, **kwargs) -> typing.List[PPkt.Phe
             #  Note: as of 4-10-24, no way to access system/data source from result of fetch_rows (can specify in fetch_rows)
             #for rsub_subj in row["subject_identifier"]:
             #    if rsub_subj["system"] == "GDC":
-            variant_interpretations = self._gdc_mutation_service.fetch_variants(row["subject_id"]) # was rsub_subj['value']
+
+            # have to strip off the leading name before first '.'
+            # e.g. TCGA.TCGA-05-4250 -> TCGA-05-4250
+            import re
+
+            subj_id = re.sub("^[^.]+\.", "", row["subject_id"])
+            print(row["subject_id"], subj_id)
 
+            variant_interpretations = self._gdc_mutation_service.fetch_variants(subj_id) # was rsub_subj['value']
             if len(variant_interpretations) == 0:
+                print("No variants found")
                 continue
-
+            else:
+                print("length variant_interpretations: {}".format(len(variant_interpretations)))
+
             # TODO: improve/enhance diagnosis term annotations
             diagnosis = PPkt.Diagnosis()
             diagnosis.disease.id = "NCIT:C3262"