Merge pull request #95 from monarch-initiative/develop_add_protein_id

Develop add protein
monarch-initiative · Jan 21, 2025 · 6c7a079 · 6c7a079
2 parents 2482477 + f4ed349
commit 6c7a079
Show file tree

Hide file tree

Showing 16 changed files with 488,077 additions and 25 deletions.
diff --git a/.gitignore b/.gitignore
@@ -174,7 +174,7 @@ c2p_env/
 !src/oncoexporter/ncit_mapping_files/*.tsv
 !src/oncoexporter/ncit_mapping_files/*.csv
 
-*.tar.gz
+#*.tar.gz
 
 
 # Mac General

diff --git a/cohorts/Bone.tar.gz b/cohorts/Bone.tar.gz
diff --git a/cohorts/Brain.tar.gz b/cohorts/Brain.tar.gz
diff --git a/cohorts/Breast.tar.gz b/cohorts/Breast.tar.gz
diff --git a/cohorts/Cervix.tar.gz b/cohorts/Cervix.tar.gz
diff --git a/cohorts/Colon.tar.gz b/cohorts/Colon.tar.gz
diff --git a/cohorts/Heart.tar.gz b/cohorts/Heart.tar.gz
diff --git a/cohorts/Kidney.tar.gz b/cohorts/Kidney.tar.gz
diff --git a/cohorts/Liver.tar.gz b/cohorts/Liver.tar.gz
diff --git a/cohorts/Lung.tar.gz b/cohorts/Lung.tar.gz
diff --git a/cohorts/Pancreas.tar.gz b/cohorts/Pancreas.tar.gz
diff --git a/cohorts/Skin.tar.gz b/cohorts/Skin.tar.gz
diff --git a/cohorts/Thyroid.tar.gz b/cohorts/Thyroid.tar.gz
diff --git a/src/oncoexporter/cda/_gdc.py b/src/oncoexporter/cda/_gdc.py
@@ -1,6 +1,9 @@
 import json
 import logging
+import os
+import tempfile
 import typing
+import urllib
 
 import pandas as pd
 import phenopackets as pp
@@ -23,7 +26,8 @@ def __init__(
             self,
             page_size=100,
             page=1,
-            timeout=10,
+            timeout=30,
+            transcript_to_protein_url='https://ftp.ensembl.org/pub/current_tsv/homo_sapiens/Homo_sapiens.GRCh38.113.ena.tsv.gz'
     ):
         self._logger = logging.getLogger(__name__)
         self._variants_url = 'https://api.gdc.cancer.gov/ssms'
@@ -43,7 +47,7 @@ def __init__(
             "tumor_allele",
             # "genomic_dna_change",
             # "end_position",
-            # "gene_aa_change",
+            "consequence.transcript.aa_change",
             "consequence.transcript.gene.gene_id",
             "consequence.transcript.gene.symbol",
             "consequence.transcript.transcript_id",
@@ -54,6 +58,23 @@ def __init__(
             "diagnoses.ajcc_pathologic_stage",
         ))
 
+        # Download the Ensembl transcript-to-protein mapping file into a temporary directory
+        with tempfile.TemporaryDirectory() as tmpdir:
+            local_transcript_file = os.path.join(tmpdir,
+                                                 'Homo_sapiens.GRCh38.113.ena.tsv.gz')
+
+            self._logger.info(
+                f"Downloading {local_transcript_file} from {transcript_to_protein_url}...")
+            urllib.request.urlretrieve(transcript_to_protein_url,
+                                       local_transcript_file)
+            self._logger.info(f"Downloaded and saved {local_transcript_file}.")
+
+            # Load the Ensembl transcript to protein mappings
+            self._ensembl_tx2prot = pd.read_csv(local_transcript_file, sep="\t")
+            self._tx_to_prot_dict = dict(
+                zip(self._ensembl_tx2prot.transcript_stable_id,
+                    self._ensembl_tx2prot.protein_stable_id))
+
     def _fetch_data_from_gdc(self, url: str, subject_id: str, fields: typing.List[str]=None) -> typing.Any:
         params = self._prepare_query_params(subject_id, fields)
         response = requests.get(url, params=params, timeout=self._timeout)
@@ -74,9 +95,15 @@ def _prepare_query_params(self, subject_ids: typing.List, fields: typing.List[st
                 "value": [subject_ids]
             }
         }
-        # filters = {"op":"and","content":[{"op":"in","content":{"field":"submitter_id","value":subj}}]}
+
+        # Define an explicit User-Agent header, intended to avoid this error:
+        # requests.exceptions.ConnectionError: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
+        headers = {
+            'User-Agent': 'My User Agent 1.0',
+        }
 
         return {
+            "headers": headers,
             "fields": fields,
             "filters": json.dumps(filters),
             "format": "JSON",
@@ -195,24 +222,26 @@ def fetch_stage_dict(self) -> dict:
 
     #    return stage_data
     def _map_mutation_to_variant_interpretation(self, mutation) -> pp.VariantInterpretation:
-        vcf_record = self._parse_vcf_record(mutation)
+
+        # TODO: 't_depth', 't_ref_count', 't_alt_count', 'n_depth', 'n_ref_count', 'n_alt_count'
+        # TODO: mutation status
+        # Example of the GDC field: "gene_aa_change": ["KRAS G12D"]
 
         vd = pp.VariationDescriptor()
         vd.id = mutation['id']
 
+        vcf_record = self._parse_vcf_record(mutation)
         if vcf_record is not None:
             vd.vcf_record.CopyFrom(vcf_record)
 
-        # TODO: 't_depth', 't_ref_count', 't_alt_count', 'n_depth', 'n_ref_count', 'n_alt_count'
-        # TODO: mutation status
-
         for csq in mutation['consequence']:
-
-            (expression, gene_descriptor) = (
-                GdcMutationService._map_consequence_to_expression_and_gene_descriptor(csq))
 
-            if expression is not None:
-                vd.expressions.append(expression)
+            expression_list = self._map_consequence_to_expression(csq)# GdcMutationService._map_consequence_to_expression(csq)
+            gene_descriptor = GdcMutationService._map_consequence_to_gene_descriptor(csq)
+
+            if expression_list is not None:
+                vd.expressions.extend(expression_list)
+
             if gene_descriptor is not None:
                 vd.gene_context.CopyFrom(gene_descriptor)
 
@@ -240,24 +269,71 @@ def _parse_vcf_record(self, mutation) -> typing.Optional[pp.VcfRecord]:
 
         return vcf_record
 
-    @staticmethod
-    def _map_consequence_to_expression_and_gene_descriptor(csq) -> (typing.Optional[pp.Expression],
-                                                                    typing.Optional[pp.GeneDescriptor]):
+    #@staticmethod # [pp.Expression]
+    def _map_consequence_to_expression(self, csq) -> typing.Optional[list]:
+
         tx = csq['transcript']
-        print(tx)
-
-        #     "consequence.transcript.gene.gene_id",
-        #    "consequence.transcript.gene.symbol",
-
-        expression = pp.Expression()
-        expression.syntax = 'hgvs.c'
+
+        expression_c = pp.Expression()
+        expression_c.syntax = 'hgvs.c'
+
         tx_id = tx['transcript_id']
         ann = tx['annotation']['hgvsc']
-        expression.value = f'{tx_id}:{ann}'
+        expression_c.value = f'{tx_id}:{ann}'
+
+        expression_p = pp.Expression()
+        prot_id = None
+        if tx_id in self._tx_to_prot_dict:
+            prot_id = self._tx_to_prot_dict[tx_id]
+
+        aa_change = None
+        if 'aa_change' in tx:
+            aa_change = 'p.'+tx['aa_change']
+
+        if aa_change is not None and prot_id is not None:
+            expression_p.syntax = 'hgvs.p'
+            expression_p.value = f'{prot_id}:{aa_change}'
+
+
+        '''
+        Phenopacket:
+        "expressions": [
+                  {
+                    "syntax": "hgvs.c",
+                    "value": "ENST00000373125:c.55C>T"
+                  },
+        
+        GDC:
+        curl 'https://api.gdc.cancer.gov/ssms/edd1ae2c-3ca9-52bd-a124-b09ed304fcc2?pretty=true&expand=consequence.transcript'
+        {
+            "data": {
+                "start_position": 25245350,
+                "gene_aa_change": [
+                    "KRAS G12D"
+                ],
+            "consequence": [
+            {
+                "transcript": {
+                    "transcript_id": "ENST00000556131",
+                    "aa_end": 12,
+                    "consequence_type": "missense_variant",
+                    "aa_start": 12,
+                    "is_canonical": false,
+                    "aa_change": "G12D",
+                    "ref_seq_accession": ""
+            }
+        },
+        '''
+        return ([expression_c,expression_p])
+
+    @staticmethod
+    def _map_consequence_to_gene_descriptor(csq) -> (typing.Optional[pp.GeneDescriptor]):
+
+        tx = csq['transcript']
 
         gene_context = pp.GeneDescriptor()
         gene_context.value_id = tx['gene']['gene_id']
         gene_context.symbol = tx['gene']['symbol']
 
-        return (expression, gene_context)
+        return(gene_context)