Skip to content

Commit

Permalink
Remove sample annotation functionality
Browse files Browse the repository at this point in the history
from ``geo-import`` process
  • Loading branch information
marcellevstek committed Apr 25, 2024
1 parent 6ea1c05 commit ced98c4
Show file tree
Hide file tree
Showing 3 changed files with 2 additions and 270 deletions.
1 change: 1 addition & 0 deletions docs/CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ Changed
- Rewrite ``expression-aggregator`` process to Python and make it
compatible with the new annotation model
- Use GRCh38.109 genome annotation version in snpEff processes
- Remove sample annotation functionality from ``geo-import`` process

Fixed
-----
Expand Down
147 changes: 1 addition & 146 deletions resolwe_bio/processes/workflows/geo_import.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,141 +46,6 @@ def create_metadata(gse, run_info):
return metadata.set_index(["Sample name"], drop=False)


def construct_annotation(metadata, sample_name):
    """Build the annotation dictionary for one sample from GEO metadata.

    The row of ``metadata`` labelled ``sample_name`` is read, with
    missing values replaced by empty strings, and mapped onto sample
    annotation field paths. Free-text fields are copied whenever their
    column is present in ``metadata``. Fields restricted to a fixed set
    of choices are normalized first and stored only when the normalized
    value is one of the accepted choices.
    """
    accepted_species = (
        "Caenorhabditis elegans",
        "Cricetulus griseus",
        "Dictyostelium discoideum",
        "Dictyostelium purpureum",
        "Drosophila melanogaster",
        "Homo sapiens",
        "Macaca mulatta",
        "Mus musculus",
        "Odocoileus virginianus texanus",
        "Rattus norvegicus",
        "Solanum tuberosum",
    )
    accepted_molecules = (
        "total_rna",
        "polya_rna",
        "cytoplasmic_rna",
        "nuclear_rna",
        "genomic_dna",
        "protein",
        "other",
    )
    accepted_assays = (
        "rna-seq",
        "chip-seq",
        "atac-seq",
        "chipmentation",
        "dna-seq",
        "nanostring",
        "microarray",
        "edge-seq",
        "other",
    )
    accepted_platforms = (
        "nextseq_500",
        "nextseq_550",
        "nextseq_550_dx",
        "nextseq_1000",
        "nextseq_2000",
        "hiseq_2500",
        "hiseq_2000",
        "novaseq_6000",
        "novaseq_6000_dx",
        "novaseq_x",
        "iseq_100",
        "miniseq",
        "miseq",
        "miseq_dx",
        "other",
    )

    columns = metadata.columns
    row = metadata.loc[sample_name].fillna("")
    result = {}

    def copy_text(column, path):
        """Store the raw value of ``column`` under ``path`` if the column exists."""
        if column in columns:
            result[path] = row[column]

    def copy_choice(column, path, normalize, accepted):
        """Store the normalized value of ``column`` if it is an accepted choice."""
        if column in columns:
            candidate = normalize(row[column])
            if candidate in accepted:
                result[path] = candidate

    # General section.
    copy_choice("organism_ch1", "general.species", lambda raw: raw, accepted_species)
    if "contact_name" in columns:
        # GEO separates the parts of a contact name with a double comma.
        result["general.annotator"] = row["contact_name"].replace(",,", " ")
    copy_text("description", "general.description")

    # Biospecimen / cell line section. The experimental model is decided
    # by which column is present; a cell line column wins over tissue.
    if "cell line" in columns:
        result["biospecimen_information.experimental_model"] = "cell_line"
        result["cell_line_information.cell_line_name"] = row["cell line"]
    elif "tissue" in columns:
        result["biospecimen_information.experimental_model"] = "tissue"
    copy_text("source_name_ch1", "biospecimen_information.source")
    copy_text("cell type", "cell_line_information.cell_type")
    copy_text("treatment_protocol_ch1", "cell_line_information.treatment_protocol")

    # Sample details section.
    copy_choice(
        "library_strategy",
        "sample_details.assay_type",
        lambda raw: raw.lower().replace(" ", "-"),
        accepted_assays,
    )
    copy_text("extract_protocol_ch1", "sample_details.extract_protocol")
    copy_text("growth_protocol_ch1", "sample_details.growth_protocol")
    copy_choice(
        "molecule_ch1",
        "sample_details.library_type",
        lambda raw: raw.lower().replace(" ", "_"),
        accepted_molecules,
    )
    copy_choice(
        "instrument_model",
        "sample_details.platform",
        lambda raw: raw.replace("Illumina ", "").lower().replace(" ", "_"),
        accepted_platforms,
    )

    return result


class GeoImport(Process):
"""Import all runs from a GEO Series.
Expand Down Expand Up @@ -224,7 +89,7 @@ class GeoImport(Process):
},
}
data_name = "{{ gse_accession }}"
version = "2.7.2"
version = "2.8.0"
process_type = "data:geo"
category = "Import"
scheduling_class = SchedulingClass.BATCH
Expand Down Expand Up @@ -510,13 +375,3 @@ def run(self, inputs, outputs):
metadata = pd.concat(metadata_tables.values(), join="outer", ignore_index=False)
metadata.to_csv(meta_file, sep="\t", index=False)
self.run_process("upload-metadata-unique", {"src": meta_file})

for entity_name in metadata["Sample name"].values:
objects = Data.filter(entity__name=entity_name)
if len(objects) > 1:
self.warning(
f"Multiple samples with entity name {entity_name} are present, annotation will be added only "
"to the last one"
)
obj = objects[-1]
obj.entity.annotations = construct_annotation(metadata, obj.entity_name)
124 changes: 0 additions & 124 deletions resolwe_bio/tests/workflows/test_geo_import.py
Original file line number Diff line number Diff line change
Expand Up @@ -165,39 +165,6 @@ def test_dss_geo(self):

self.assertEqual(sample.annotations.count(), 12)

self.assertAnnotation(sample, "general.species", "Homo sapiens")

self.assertAnnotation(sample, "general.annotator", "Lin He")
self.assertAnnotation(sample, "general.description", "ING5 knockdown")

self.assertAnnotation(
sample, "biospecimen_information.experimental_model", "cell_line"
)
self.assertAnnotation(sample, "biospecimen_information.source", "HepG2 cells")

self.assertAnnotation(sample, "cell_line_information.cell_line_name", "HepG2")
self.assertAnnotation(
sample,
"cell_line_information.treatment_protocol",
(
"HepG2 cells were transfected with vector or FLAG-JFK or treated with control "
"siRNA or ING5 siRNA."
),
)

self.assertAnnotation(sample, "sample_details.assay_type", "rna-seq")
self.assertAnnotation(
sample,
"sample_details.extract_protocol",
(
"Total mRNAs were isolated with Trizol reagents (Invitrogen) for cDNA synthesis, "
"library construction, and sequencing using HiSeq 2500. RNA libraries were "
"prepared for sequencing using standard Illumina protocols."
),
)
self.assertAnnotation(sample, "sample_details.library_type", "total_rna")
self.assertAnnotation(sample, "sample_details.platform", "hiseq_2000")

# Non-existent GSE series.
wrong = self.run_process(
"geo-import", {"gse_accession": "GSE99999999"}, Data.STATUS_ERROR
Expand Down Expand Up @@ -327,47 +294,6 @@ def test_geo_chipseq(self):

self.assertEqual(sample.annotations.count(), 11)

# general
self.assertAnnotation(sample, "general.species", "Homo sapiens")
self.assertAnnotation(sample, "general.annotator", "Matthew Weirauch")
self.assertAnnotation(
sample, "general.description", "ChIP_EBNA2_GM12878_rep2-E00457"
)

# biospecimen_information
self.assertAnnotation(
sample, "biospecimen_information.experimental_model", "cell_line"
)
self.assertAnnotation(
sample, "biospecimen_information.source", "GM12878 (B-Lymphocyte) LCL"
)

# cell_line_information
self.assertAnnotation(
sample, "cell_line_information.cell_line_name", "GM12878 (B-Lymphocyte) LCL"
)
self.assertAnnotation(sample, "cell_line_information.treatment_protocol", "")

# sample_details
self.assertAnnotation(sample, "sample_details.assay_type", "chip-seq")
self.assertAnnotation(
sample,
"sample_details.extract_protocol",
(
"Cells were crosslinked and nuclei were sonicated as described previously "
"(Lu et al. 2015). Libraries were prepared via ChIPmentation (Schmidl et al. "
"2015)."
),
)
self.assertAnnotation(
sample,
"sample_details.growth_protocol",
(
"Cells were cultured in 10% FBS supplemented RPMI 1640 medium for 2 weeks."
),
)
self.assertAnnotation(sample, "sample_details.library_type", "genomic_dna")

@with_resolwe_host
@tag_process("geo-import")
def test_geo_ena(self):
Expand Down Expand Up @@ -418,53 +344,3 @@ def test_geo_ena(self):
sample = sra.entity

self.assertEqual(sample.annotations.count(), 7)

# general
self.assertAnnotation(sample, "general.annotator", "ArrayExpress EBI")
self.assertAnnotation(
sample, "general.description", "Provider: EMBL_Heidelberg"
)

# biospecimen_information
self.assertAnnotation(
sample,
"biospecimen_information.source",
"E_coli_K12_MG1655_RNAseq_LB_Transition_to_stationary",
)

# sample_details
self.assertAnnotation(sample, "sample_details.assay_type", "rna-seq")

self.assertAnnotation(
sample,
"sample_details.extract_protocol",
(
"nucleic_acid_extraction | To prepare cells for RNA "
"extraction, 100 ml of fresh LB was inoculated 1:200 from an overnight culture "
"in a 250 ml flask and incubated with shaking at 180 r.p.m. in a New Brunswick "
"C76 waterbath at 37C. Two biological replicates were performed for each strain "
"and samples were taken at early-exponential, mid-exponential, "
"transition-to-stationary and stationary phase. The cells were pelleted by "
"centrifugation (10000 g, 10 min, 4¡C), washed in 1xPBS and pellets were "
"snap-frozen and stored at -80C until required. RNA was extracted using Trizol "
"Reagent (Invitrogen) according to the manufacturer's protocol until the "
"chloroform extraction step. The aqueous phase was then loaded onto mirVanaTM "
"miRNA Isolation kit (Ambion Inc.) columns and washed according to the "
"manufacturer's protocol. Total RNA was eluted in 50µl of RNAase free water. "
"The concentration was then determined using a Nanodrop ND-1000 machine (NanoDrop "
"Technologies), and RNA quality was tested by visualization on agarose gels and "
"by Agilent 2100 Bioanalyser (Agilent Technologies). sequencing | Standard "
"Illumina protocol for cDNA sequencing"
),
)
self.assertAnnotation(
sample,
"sample_details.growth_protocol",
(
"grow | The E. coli K-12 MG1655 bacterial strains used in this "
"work are the following: E. coli MG1655 (F- lambda- ilvG- rfb-50 rph-1). "
"Luria-Bertani (0.5% NaCl) broth and agar (15 g/liter) were used for routine "
"growth."
),
)
self.assertAnnotation(sample, "sample_details.library_type", "total_rna")

0 comments on commit ced98c4

Please sign in to comment.