Skip to content

Commit

Permalink
fixed getting variants from GDC
Browse files Browse the repository at this point in the history
  • Loading branch information
msierk committed Apr 11, 2024
1 parent 8c7b2ca commit ac183ca
Show file tree
Hide file tree
Showing 12 changed files with 53,696 additions and 85,304 deletions.
3,850 changes: 1,034 additions & 2,816 deletions diagnosis_df.txt

Large diffs are not rendered by default.

21,994 changes: 5,358 additions & 16,636 deletions diagnosis_rsub.txt

Large diffs are not rendered by default.

1,439 changes: 1,439 additions & 0 deletions interpretations_list.txt

Large diffs are not rendered by default.

29 changes: 29 additions & 0 deletions notebooks/Biosample_test.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -377,6 +377,35 @@
"treatment = fetch_rows(table='treatment', match_any=['primary_diagnosis_site = *lung*' , 'primary_diagnosis_site = *pulmonary*'], add_columns=['subject_id'])\n",
"treatment.head()"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[]"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#from oncoexporter.cda import CdaTableImporter\n",
"from oncoexporter.cda._gdc import GdcMutationService\n",
"subject_id_list = ['TCGA-05-4244', 'TCGA-05-4245', 'TCGA-05-4249', 'TCGA-05-4250', 'TCGA-05-4382',\n",
" 'TCGA-05-4384', 'TCGA-05-4389', 'TCGA-05-4390', 'TCGA-05-4395', 'TCGA-05-4396']\n",
"subject_id = 'AP-CT3G' #'4d_lung.100_HM10395' # 'TCGA-DU-6407' CPTAC.C3N-01409\n",
"gdc_timeout=int(100_000)\n",
"gdc_mutation_service = GdcMutationService(timeout=gdc_timeout)\n",
"\n",
"variant_interpretations = gdc_mutation_service.fetch_variants(subject_id)\n",
"variant_interpretations"
]
}
],
"metadata": {
Expand Down
58 changes: 58 additions & 0 deletions notebooks/GDC_mutations_test.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'TCGA-05-4250'"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
" ''' {\n",
" \"op\": \"in\",\n",
" \"content\": {\n",
" \"field\": \"case.available_variation_data\",\n",
" \"value\": [\n",
" \"ssm\"\n",
" ]\n",
" }\n",
" },\n",
"'''\n",
"import re\n",
"string = 'TCGA.TCGA-05-4250'\n",
"string = 'TCGA-05-4250'\n",
"re.sub(\"^[^.]+\\.\", \"\", string) # ^(.*?)\\..* ^[^.]*\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Phenopackets",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.8"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
17,147 changes: 5,415 additions & 11,732 deletions rsub_diagnosis_df.txt

Large diffs are not rendered by default.

10 changes: 7 additions & 3 deletions scripts/run_lung.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,14 +15,18 @@
fetch_rows( table='subject', match_all=[ 'primary_disease_type = *duct*', 'sex = F*' ] )
fetch_rows( table='researchsubject', match_all=[ 'primary_diagnosis_site = NULL' ] )
'''
table_importer: CdaTableImporter = configure_cda_table_importer(use_cache=True)
###### Input parameters ########
table_importer: CdaTableImporter = configure_cda_table_importer(use_cache=False)

#Tsite = Q('primary_diagnosis_site = "%lung%" OR primary_diagnosis_site = "%pulmonary%"')
# b = {'x':42, 'y':None}
# function(1, **b) # equal to function(1, x=42, y=None)
Tsite = {'match_any': ['primary_diagnosis_site = *lung*' , 'primary_diagnosis_site = *pulmonary*']}
Query = {'match_any': ['primary_diagnosis_site = *lung*' , 'primary_diagnosis_site = *pulmonary*'],
'data_source': 'GDC'}
cohort_name = 'Lung'
p = table_importer.get_ga4gh_phenopackets(Tsite, cohort_name=cohort_name)
####################################

p = table_importer.get_ga4gh_phenopackets(Query, cohort_name=cohort_name)

result_dir = os.path.abspath(os.path.join('phenopackets', cohort_name))
os.makedirs(result_dir, exist_ok=True)
Expand Down
77,108 changes: 34,745 additions & 42,363 deletions specimen_df.txt

Large diffs are not rendered by default.

127 changes: 127 additions & 0 deletions src/oncoexporter/cda/_gdc.orig.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
import json
import logging
import typing

import phenopackets as pp
import requests


class GdcMutationService:
"""
`GdcMutationService` queries Genomic Data Commons REST endpoint to fetch variants for a CDA subject.
"""

def __init__(
self,
page_size=100,
page=1,
timeout=10,
):
self._logger = logging.getLogger(__name__)
self._url = 'https://api.gdc.cancer.gov/ssms'
self._page_size = page_size
self._page = page
self._timeout = timeout
self._fields = ','.join((
# "mutation_type",
# "mutation_subtype",
"ncbi_build",
"chromosome",
"start_position",
# "end_position",
"reference_allele",
"tumor_allele",
# "genomic_dna_change",
# "end_position",
# "gene_aa_change",
"consequence.transcript.gene.gene_id",
"consequence.transcript.gene.symbol",
"consequence.transcript.transcript_id",
"consequence.transcript.annotation.hgvsc",
))

def fetch_variants(self, subject_id: str) -> typing.Sequence[pp.VariantInterpretation]:
params = self._prepare_query_params(subject_id)

response = requests.get(self._url, params=params, timeout=self._timeout)

if response.status_code == 200:
data = response.json()
mutations = data.get("data", {}).get("hits", [])

mutation_details = []
for mutation in mutations:
vi = self._map_mutation_to_variant_interpretation(mutation)
mutation_details.append(vi)

return mutation_details
else:
raise ValueError(f'Failed to fetch data due to {response.status_code}: {response.reason}')

def _prepare_query_params(self, subject_id: str):
filters = {
"op": "in",
"content": {
"field": "cases.submitter_id",
"value": [subject_id]
}
}
return {
"fields": self._fields,
"filters": json.dumps(filters),
"format": "JSON",
"size": self._page_size,
}

def _map_mutation_to_variant_interpretation(self, mutation) -> pp.VariantInterpretation:
vcf_record = self._parse_vcf_record(mutation)

vd = pp.VariationDescriptor()
vd.id = mutation['id']
if vcf_record is not None:
vd.vcf_record.CopyFrom(vcf_record)

# TODO: set gene
# TODO: 't_depth', 't_ref_count', 't_alt_count', 'n_depth', 'n_ref_count', 'n_alt_count'
# TODO: mutation status

for csq in mutation['consequence']:
expression = GdcMutationService._map_consequence_to_expression(csq)
if expression is not None:
vd.expressions.append(expression)

vd.molecule_context = pp.MoleculeContext.genomic

vi = pp.VariantInterpretation()
vi.variation_descriptor.CopyFrom(vd)
return vi

def _parse_vcf_record(self, mutation) -> typing.Optional[pp.VcfRecord]:
if mutation['reference_allele'] == '-' or mutation['tumor_allele'] == '-':
self._logger.debug(
'Cannot create a VCF record due to missing bases in the reference_allele/tumor_allele alleles: %s',
mutation)
return None

vcf_record = pp.VcfRecord()

vcf_record.genome_assembly = mutation['ncbi_build']
vcf_record.chrom = mutation['chromosome']
vcf_record.id = mutation['id']
vcf_record.pos = mutation['start_position']
vcf_record.ref = mutation['reference_allele']
vcf_record.alt = mutation['tumor_allele']

return vcf_record

@staticmethod
def _map_consequence_to_expression(csq) -> typing.Optional[pp.Expression]:
tx = csq['transcript']

expression = pp.Expression()
expression.syntax = 'hgvs.c'
tx_id = tx['transcript_id']
ann = tx['annotation']['hgvsc']
expression.value = f'{tx_id}:{ann}'

return expression
77 changes: 57 additions & 20 deletions src/oncoexporter/cda/_gdc.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,13 @@ def __init__(
timeout=10,
):
self._logger = logging.getLogger(__name__)
self._url = 'https://api.gdc.cancer.gov/ssms'
self._variants_url = 'https://api.gdc.cancer.gov/ssms'
self._survival_url = 'https://api.gdc.cancer.gov/analysis/survival'
self._cases = 'https://api.gdc.cancer.gov/cases'
self._page_size = page_size
self._page = page
self._timeout = timeout
self._fields = ','.join((
self._variant_fields = ','.join((
# "mutation_type",
# "mutation_subtype",
"ncbi_build",
Expand All @@ -39,26 +41,20 @@ def __init__(
"consequence.transcript.transcript_id",
"consequence.transcript.annotation.hgvsc",
))
self._case_fields = ','.join((
"demographic.vital_status",
))

def fetch_variants(self, subject_id: str) -> typing.Sequence[pp.VariantInterpretation]:
params = self._prepare_query_params(subject_id)

response = requests.get(self._url, params=params, timeout=self._timeout)

def _fetch_data_from_gdc(self, url: str, subject_id: str, fields: typing.List[str]=None) -> typing.Any:
params = self._prepare_query_params(subject_id, fields)
response = requests.get(url, params=params, timeout=self._timeout)
if response.status_code == 200:
data = response.json()
mutations = data.get("data", {}).get("hits", [])

mutation_details = []
for mutation in mutations:
vi = self._map_mutation_to_variant_interpretation(mutation)
mutation_details.append(vi)

return mutation_details
return data
else:
raise ValueError(f'Failed to fetch data due to {response.status_code}: {response.reason}')

def _prepare_query_params(self, subject_id: str):
raise ValueError(f'Failed to fetch data from {url} due to {response.status_code}: {response.reason}')
def _prepare_query_params(self, subject_id: str, fields: typing.List[str]=None) -> typing.Dict:
filters = {
"op": "in",
"content": {
Expand All @@ -67,12 +63,53 @@ def _prepare_query_params(self, subject_id: str):
}
}
return {
"fields": self._fields,
"fields": fields,
"filters": json.dumps(filters),
"format": "JSON",
"size": self._page_size,
}

def fetch_variants(self, subject_id: str) -> typing.Sequence[pp.VariantInterpretation]:
variants = self._fetch_data_from_gdc(self._variants_url, subject_id, self._variant_fields)

mutations = variants.get("data", {}).get("hits", [])

mutation_details = []
for mutation in mutations:
vi = self._map_mutation_to_variant_interpretation(mutation)
mutation_details.append(vi)

return mutation_details

def fetch_vital_status(self, subject_id: str) -> pp.VitalStatus:
survival_data = self._fetch_data_from_gdc(self._survival_url, subject_id)
vital_status_data = self._fetch_data_from_gdc(self._cases, subject_id, self._case_fields)

survival_time = None
vital_status = None

survival_results = survival_data.get("results", [])
if survival_results:
donors = survival_results[0].get("donors", [])
if donors:
survival_time = donors[0].get("time")

vital_status_hits = vital_status_data.get("data", {}).get("hits", [])
if vital_status_hits:
demographic = vital_status_hits[0].get("demographic", {})
vital_status = demographic.get("vital_status")

vital_status_obj = pp.VitalStatus()
vital_status_obj.survival_time_in_days = int(survival_time) if survival_time is not None else 0
if vital_status == "Dead":
vital_status_obj.status = pp.VitalStatus.Status.DECEASED
elif vital_status == "Alive":
vital_status_obj.status = pp.VitalStatus.Status.ALIVE
else:
vital_status_obj.status = pp.VitalStatus.Status.UNKNOWN_STATUS

return vital_status_obj

def _map_mutation_to_variant_interpretation(self, mutation) -> pp.VariantInterpretation:
vcf_record = self._parse_vcf_record(mutation)

Expand Down Expand Up @@ -124,4 +161,4 @@ def _map_consequence_to_expression(csq) -> typing.Optional[pp.Expression]:
ann = tx['annotation']['hgvsc']
expression.value = f'{tx_id}:{ann}'

return expression
return expression
14 changes: 12 additions & 2 deletions src/oncoexporter/cda/cda_table_importer.py
Original file line number Diff line number Diff line change
Expand Up @@ -314,11 +314,21 @@ def get_ga4gh_phenopackets(self, source: dict, **kwargs) -> typing.List[PPkt.Phe
# Note: as of 4-10-24, no way to access system/data source from result of fetch_rows (can specify in fetch_rows)
#for rsub_subj in row["subject_identifier"]:
# if rsub_subj["system"] == "GDC":
variant_interpretations = self._gdc_mutation_service.fetch_variants(row["subject_id"]) # was rsub_subj['value']

# have to strip off the leading name before first '.'
# e.g. TCGA.TCGA-05-4250 -> TCGA-05-4250
import re

subj_id = re.sub("^[^.]+\.", "", row["subject_id"])
print(row["subject_id"], subj_id)

variant_interpretations = self._gdc_mutation_service.fetch_variants(subj_id) # was rsub_subj['value']
if len(variant_interpretations) == 0:
print("No variants found")
continue

else:
print("length variant_interpretations: {}".format(len(variant_interpretations)))

# TODO: improve/enhance diagnosis term annotations
diagnosis = PPkt.Diagnosis()
diagnosis.disease.id = "NCIT:C3262"
Expand Down
Loading

0 comments on commit ac183ca

Please sign in to comment.