Skip to content

Commit

Permalink
Merge pull request #95 from monarch-initiative/develop_add_protein_id
Browse files Browse the repository at this point in the history
Develop add protein
  • Loading branch information
msierk authored Jan 21, 2025
2 parents 2482477 + f4ed349 commit 6c7a079
Show file tree
Hide file tree
Showing 16 changed files with 488,077 additions and 25 deletions.
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -174,7 +174,7 @@ c2p_env/
!src/oncoexporter/ncit_mapping_files/*.tsv
!src/oncoexporter/ncit_mapping_files/*.csv

*.tar.gz
#*.tar.gz


# Mac General
Expand Down
Binary file added cohorts/Bone.tar.gz
Binary file not shown.
Binary file added cohorts/Brain.tar.gz
Binary file not shown.
Binary file added cohorts/Breast.tar.gz
Binary file not shown.
Binary file added cohorts/Cervix.tar.gz
Binary file not shown.
Binary file added cohorts/Colon.tar.gz
Binary file not shown.
Binary file added cohorts/Heart.tar.gz
Binary file not shown.
Binary file added cohorts/Kidney.tar.gz
Binary file not shown.
Binary file added cohorts/Liver.tar.gz
Binary file not shown.
Binary file added cohorts/Lung.tar.gz
Binary file not shown.
Binary file added cohorts/Pancreas.tar.gz
Binary file not shown.
Binary file added cohorts/Skin.tar.gz
Binary file not shown.
Binary file added cohorts/Thyroid.tar.gz
Binary file not shown.
124 changes: 100 additions & 24 deletions src/oncoexporter/cda/_gdc.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
import json
import logging
import os
import tempfile
import typing
import urllib

import pandas as pd
import phenopackets as pp
Expand All @@ -23,7 +26,8 @@ def __init__(
self,
page_size=100,
page=1,
timeout=10,
timeout=30,
transcript_to_protein_url='https://ftp.ensembl.org/pub/current_tsv/homo_sapiens/Homo_sapiens.GRCh38.113.ena.tsv.gz'
):
self._logger = logging.getLogger(__name__)
self._variants_url = 'https://api.gdc.cancer.gov/ssms'
Expand All @@ -43,7 +47,7 @@ def __init__(
"tumor_allele",
# "genomic_dna_change",
# "end_position",
# "gene_aa_change",
"consequence.transcript.aa_change",
"consequence.transcript.gene.gene_id",
"consequence.transcript.gene.symbol",
"consequence.transcript.transcript_id",
Expand All @@ -54,6 +58,23 @@ def __init__(
"diagnoses.ajcc_pathologic_stage",
))

# Use a temporary directory to download the file
with tempfile.TemporaryDirectory() as tmpdir:
local_transcript_file = os.path.join(tmpdir,
'Homo_sapiens.GRCh38.113.ena.tsv.gz')

self._logger.info(
f"Downloading {local_transcript_file} from {transcript_to_protein_url}...")
urllib.request.urlretrieve(transcript_to_protein_url,
local_transcript_file)
self._logger.info(f"Downloaded and saved {local_transcript_file}.")

# Load the Ensembl transcript to protein mappings
self._ensembl_tx2prot = pd.read_csv(local_transcript_file, sep="\t")
self._tx_to_prot_dict = dict(
zip(self._ensembl_tx2prot.transcript_stable_id,
self._ensembl_tx2prot.protein_stable_id))

def _fetch_data_from_gdc(self, url: str, subject_id: str, fields: typing.List[str]=None) -> typing.Any:
params = self._prepare_query_params(subject_id, fields)
response = requests.get(url, params=params, timeout=self._timeout)
Expand All @@ -74,9 +95,15 @@ def _prepare_query_params(self, subject_ids: typing.List, fields: typing.List[st
"value": [subject_ids]
}
}
# filters = {"op":"and","content":[{"op":"in","content":{"field":"submitter_id","value":subj}}]}

# To avoid this error:
# requests.exceptions.ConnectionError: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
headers = {
'User-Agent': 'My User Agent 1.0',
}

return {
"headers": headers,
"fields": fields,
"filters": json.dumps(filters),
"format": "JSON",
Expand Down Expand Up @@ -195,24 +222,26 @@ def fetch_stage_dict(self) -> dict:

# return stage_data
def _map_mutation_to_variant_interpretation(self, mutation) -> pp.VariantInterpretation:
vcf_record = self._parse_vcf_record(mutation)

# TODO: 't_depth', 't_ref_count', 't_alt_count', 'n_depth', 'n_ref_count', 'n_alt_count'
# TODO: mutation status
# "gene_aa_change": ["KRAS G12D"]

vd = pp.VariationDescriptor()
vd.id = mutation['id']

vcf_record = self._parse_vcf_record(mutation)
if vcf_record is not None:
vd.vcf_record.CopyFrom(vcf_record)

# TODO: 't_depth', 't_ref_count', 't_alt_count', 'n_depth', 'n_ref_count', 'n_alt_count'
# TODO: mutation status

for csq in mutation['consequence']:

(expression, gene_descriptor) = (
GdcMutationService._map_consequence_to_expression_and_gene_descriptor(csq))

if expression is not None:
vd.expressions.append(expression)
expression_list = self._map_consequence_to_expression(csq)# GdcMutationService._map_consequence_to_expression(csq)
gene_descriptor = GdcMutationService._map_consequence_to_gene_descriptor(csq)

if expression_list is not None:
vd.expressions.extend(expression_list)

if gene_descriptor is not None:
vd.gene_context.CopyFrom(gene_descriptor)

Expand Down Expand Up @@ -240,24 +269,71 @@ def _parse_vcf_record(self, mutation) -> typing.Optional[pp.VcfRecord]:

return vcf_record

@staticmethod
def _map_consequence_to_expression_and_gene_descriptor(csq) -> (typing.Optional[pp.Expression],
typing.Optional[pp.GeneDescriptor]):
#@staticmethod # [pp.Expression]
def _map_consequence_to_expression(self, csq) -> typing.Optional[list]:
    """Build the HGVS expressions for a single GDC consequence record.

    A coding (``hgvs.c``) expression is always produced from the transcript
    ID and its ``annotation.hgvsc`` value, e.g. ``ENST00000373125:c.55C>T``.
    A protein (``hgvs.p``) expression is added only when the transcript ID
    has an entry in ``self._tx_to_prot_dict`` *and* the record carries a
    non-empty ``aa_change`` (e.g. ``"G12D"``), giving
    ``<protein_id>:p.G12D``.

    The relevant part of a GDC ``consequence`` element looks like::

        {"transcript": {"transcript_id": "ENST00000556131",
                        "aa_change": "G12D",
                        "annotation": {"hgvsc": "c.35G>A"},
                        ...}}

    :param csq: one element of a mutation's ``consequence`` list; its
        ``transcript`` entry must provide ``transcript_id`` and
        ``annotation.hgvsc`` and may provide ``aa_change``.
    :returns: a list with the ``hgvs.c`` expression, followed by the
        ``hgvs.p`` expression when it could be built.
    :raises KeyError: if ``transcript``, ``transcript_id`` or
        ``annotation.hgvsc`` is missing from ``csq``.
    """
    tx = csq['transcript']
    tx_id = tx['transcript_id']
    hgvsc = tx['annotation']['hgvsc']

    expression_c = pp.Expression()
    expression_c.syntax = 'hgvs.c'
    expression_c.value = f'{tx_id}:{hgvsc}'
    expressions = [expression_c]

    prot_id = self._tx_to_prot_dict.get(tx_id)
    # `aa_change` may be absent or null in the record.
    aa_change = tx.get('aa_change')

    # The mapping is built from a pandas table, so a missing protein ID can
    # surface as NaN (a float); only accept real, non-empty strings.
    has_protein = isinstance(prot_id, str) and bool(prot_id)
    if has_protein and aa_change:
        expression_p = pp.Expression()
        expression_p.syntax = 'hgvs.p'
        expression_p.value = f'{prot_id}:p.{aa_change}'
        # Only emit the protein expression when it is fully populated, so
        # that no blank Expression ends up in the variation descriptor.
        expressions.append(expression_p)

    return expressions

@staticmethod
def _map_consequence_to_gene_descriptor(csq) -> typing.Optional[pp.GeneDescriptor]:
    """Build the gene descriptor for a single GDC consequence record.

    :param csq: one element of a mutation's ``consequence`` list; its
        ``transcript.gene`` entry must provide ``gene_id`` and ``symbol``.
    :returns: a ``GeneDescriptor`` whose ``value_id`` is the gene ID and
        whose ``symbol`` is the gene symbol.
    :raises KeyError: if ``transcript``, ``gene``, ``gene_id`` or ``symbol``
        is missing from ``csq``.
    """
    gene = csq['transcript']['gene']

    gene_context = pp.GeneDescriptor()
    gene_context.value_id = gene['gene_id']
    gene_context.symbol = gene['symbol']

    # Return the descriptor alone: the result is copied straight into
    # `VariationDescriptor.gene_context`.
    return gene_context

Loading

0 comments on commit 6c7a079

Please sign in to comment.