diff --git a/docs/CHANGELOG.rst b/docs/CHANGELOG.rst index b91ef707e..3995b6687 100644 --- a/docs/CHANGELOG.rst +++ b/docs/CHANGELOG.rst @@ -20,6 +20,7 @@ Changed - Rewrite ``expression-aggregator`` process to Python and make it compatible with the new annotation model - Use GRCh38.109 genome annotation version in snpEff processes +- Remove sample annotation functionality from ``geo-import`` process Fixed ----- diff --git a/resolwe_bio/processes/workflows/geo_import.py b/resolwe_bio/processes/workflows/geo_import.py index 2e74de1c3..10a425d1d 100644 --- a/resolwe_bio/processes/workflows/geo_import.py +++ b/resolwe_bio/processes/workflows/geo_import.py @@ -46,141 +46,6 @@ def create_metadata(gse, run_info): return metadata.set_index(["Sample name"], drop=False) -def construct_annotation(metadata, sample_name): - """Construct sample annotations from metadata. - - Dictionary with GEO metadata that matches the sample annotation - schema is created. Attributes under general that have no - predetermined choices are matched with our naming if they - exist in the metadata. Other fields with choices and the - experimental section are filled separately. - """ - - sample_metadata = metadata.loc[sample_name] - sample_metadata = sample_metadata.fillna("") - annotation = {} - - species = [ - "Caenorhabditis elegans", - "Cricetulus griseus", - "Dictyostelium discoideum", - "Dictyostelium purpureum", - "Drosophila melanogaster", - "Homo sapiens", - "Macaca mulatta", - "Mus musculus", - "Odocoileus virginianus texanus", - "Rattus norvegicus", - "Solanum tuberosum", - ] - molecule_choices = [ - "total_rna", - "polya_rna", - "cytoplasmic_rna", - "nuclear_rna", - "genomic_dna", - "protein", - "other", - ] - - assay_types = [ - "rna-seq", - "chip-seq", - "atac-seq", - "chipmentation", - "dna-seq", - "nanostring", - "microarray", - "edge-seq", - "other", - ] - - platform_types = [ - "nextseq_500", - "nextseq_550", - "nextseq_550_dx", - "nextseq_1000", - "nextseq_2000", - "hiseq_2500", - "hiseq_2000", - "novaseq_6000", - "novaseq_6000_dx", - "novaseq_x", - "iseq_100", - "miniseq", - "miseq", - "miseq_dx", - "other", - ] - - if ( - "organism_ch1" in metadata.columns - and sample_metadata["organism_ch1"] in species - ): - annotation["general.species"] = sample_metadata["organism_ch1"] - - if "contact_name" in metadata.columns: - annotation["general.annotator"] = sample_metadata["contact_name"].replace( - ",,", " " - ) - - if "description" in metadata.columns: - annotation["general.description"] = sample_metadata["description"] - - if "cell line" in metadata.columns: - annotation["biospecimen_information.experimental_model"] = "cell_line" - annotation["cell_line_information.cell_line_name"] = sample_metadata[ - "cell line" - ] - elif "tissue" in metadata.columns: - annotation["biospecimen_information.experimental_model"] = "tissue" - - if "source_name_ch1" in metadata.columns: - annotation["biospecimen_information.source"] = sample_metadata[ - "source_name_ch1" - ] - - if "cell type" in metadata.columns: - annotation["cell_line_information.cell_type"] = sample_metadata["cell type"] - - if "treatment_protocol_ch1" in metadata.columns: - annotation["cell_line_information.treatment_protocol"] = sample_metadata[ - "treatment_protocol_ch1" - ] - - if "library_strategy" in metadata.columns: - formated_assay = sample_metadata["library_strategy"].lower().replace(" ", "-") - if formated_assay in assay_types: - annotation["sample_details.assay_type"] = formated_assay - - if "extract_protocol_ch1" in metadata.columns: - annotation["sample_details.extract_protocol"] = sample_metadata[ - "extract_protocol_ch1" - ] - - if "growth_protocol_ch1" in metadata.columns: - annotation["sample_details.growth_protocol"] = sample_metadata[ - "growth_protocol_ch1" - ] - - if "molecule_ch1" in metadata.columns: - formated_molecule = sample_metadata["molecule_ch1"].lower().replace(" ", "_") - if formated_molecule in molecule_choices: - annotation["sample_details.library_type"] = formated_molecule - - if "instrument_model" in metadata.columns: - formated_platform = ( - sample_metadata["instrument_model"] - .replace("Illumina ", "") - .lower() - .replace(" ", "_") - ) - if formated_platform in platform_types: - annotation["sample_details.platform"] = formated_platform - - return annotation - - class GeoImport(Process): """Import all runs from a GEO Series. @@ -224,7 +89,7 @@ class GeoImport(Process): }, } data_name = "{{ gse_accession }}" - version = "2.7.2" + version = "2.8.0" process_type = "data:geo" category = "Import" scheduling_class = SchedulingClass.BATCH @@ -510,13 +375,3 @@ def run(self, inputs, outputs): metadata = pd.concat(metadata_tables.values(), join="outer", ignore_index=False) metadata.to_csv(meta_file, sep="\t", index=False) self.run_process("upload-metadata-unique", {"src": meta_file}) - - for entity_name in metadata["Sample name"].values: - objects = Data.filter(entity__name=entity_name) - if len(objects) > 1: - self.warning( - f"Multiple samples with entity name {entity_name} are present, annotation will be added only " - "to the last one" - ) - obj = objects[-1] - obj.entity.annotations = construct_annotation(metadata, obj.entity_name) diff --git a/resolwe_bio/tests/workflows/test_geo_import.py b/resolwe_bio/tests/workflows/test_geo_import.py index 18f23043d..92459750b 100644 --- a/resolwe_bio/tests/workflows/test_geo_import.py +++ b/resolwe_bio/tests/workflows/test_geo_import.py @@ -165,39 +165,6 @@ def test_dss_geo(self): self.assertEqual(sample.annotations.count(), 12) - self.assertAnnotation(sample, "general.species", "Homo sapiens") - - self.assertAnnotation(sample, "general.annotator", "Lin He") - self.assertAnnotation(sample, "general.description", "ING5 knockdown") - - self.assertAnnotation( - sample, "biospecimen_information.experimental_model", "cell_line" - ) - self.assertAnnotation(sample, "biospecimen_information.source", "HepG2 cells") - - self.assertAnnotation(sample, "cell_line_information.cell_line_name", "HepG2") - self.assertAnnotation( - sample, - "cell_line_information.treatment_protocol", - ( - "HepG2 cells were transfected with vector or FLAG-JFK or treated with control " - "siRNA or ING5 siRNA." - ), - ) - - self.assertAnnotation(sample, "sample_details.assay_type", "rna-seq") - self.assertAnnotation( - sample, - "sample_details.extract_protocol", - ( - "Total mRNAs were isolated with Trizol reagents (Invitrogen) for cDNA synthesis, " - "library construction, and sequencing using HiSeq 2500. RNA libraries were " - "prepared for sequencing using standard Illumina protocols." - ), - ) - self.assertAnnotation(sample, "sample_details.library_type", "total_rna") - self.assertAnnotation(sample, "sample_details.platform", "hiseq_2000") - # Non-existant GSE series. wrong = self.run_process( "geo-import", {"gse_accession": "GSE99999999"}, Data.STATUS_ERROR @@ -327,47 +294,6 @@ def test_geo_chipseq(self): self.assertEqual(sample.annotations.count(), 11) - # general - self.assertAnnotation(sample, "general.species", "Homo sapiens") - self.assertAnnotation(sample, "general.annotator", "Matthew Weirauch") - self.assertAnnotation( - sample, "general.description", "ChIP_EBNA2_GM12878_rep2-E00457" - ) - - # biospecimen_information - self.assertAnnotation( - sample, "biospecimen_information.experimental_model", "cell_line" - ) - self.assertAnnotation( - sample, "biospecimen_information.source", "GM12878 (B-Lymphocyte) LCL" - ) - - # cell_line_information - self.assertAnnotation( - sample, "cell_line_information.cell_line_name", "GM12878 (B-Lymphocyte) LCL" - ) - self.assertAnnotation(sample, "cell_line_information.treatment_protocol", "") - - # sample_details - self.assertAnnotation(sample, "sample_details.assay_type", "chip-seq") - self.assertAnnotation( - sample, - "sample_details.extract_protocol", - ( - "Cells were crosslinked and nuclei were sonicated as described previously " - "(Lu et al. 2015). Libraries were prepared via ChIPmentation (Schmidl et al. " - "2015)." - ), - ) - self.assertAnnotation( - sample, - "sample_details.growth_protocol", - ( - "Cells were cultured in 10% FBS supplemented RPMI 1640 medium for 2 weeks." - ), - ) - self.assertAnnotation(sample, "sample_details.library_type", "genomic_dna") - @with_resolwe_host @tag_process("geo-import") def test_geo_ena(self): @@ -418,53 +344,3 @@ def test_geo_ena(self): sample = sra.entity self.assertEqual(sample.annotations.count(), 7) - - # general - self.assertAnnotation(sample, "general.annotator", "ArrayExpress EBI") - self.assertAnnotation( - sample, "general.description", "Provider: EMBL_Heidelberg" - ) - - # biospecimen_information - self.assertAnnotation( - sample, - "biospecimen_information.source", - "E_coli_K12_MG1655_RNAseq_LB_Transition_to_stationary", - ) - - # sample_details - self.assertAnnotation(sample, "sample_details.assay_type", "rna-seq") - - self.assertAnnotation( - sample, - "sample_details.extract_protocol", - ( - "nucleic_acid_extraction | To prepare cells for RNA " - "extraction, 100 ml of fresh LB was inoculated 1:200 from an overnight culture " - "in a 250 ml flask and incubated with shaking at 180 r.p.m. in a New Brunswick " - "C76 waterbath at 37C. Two biological replicates were performed for each strain " - "and samples were taken at early-exponential, mid-exponential, " - "transition-to-stationary and stationary phase. The cells were pelleted by " - "centrifugation (10000 g, 10 min, 4¡C), washed in 1xPBS and pellets were " - "snap-frozen and stored at -80C until required. RNA was extracted using Trizol " - "Reagent (Invitrogen) according to the manufacturer's protocol until the " - "chloroform extraction step. The aqueous phase was then loaded onto mirVanaTM " - "miRNA Isolation kit (Ambion Inc.) columns and washed according to the " - "manufacturer's protocol. Total RNA was eluted in 50µl of RNAase free water. " - "The concentration was then determined using a Nanodrop ND-1000 machine (NanoDrop " - "Technologies), and RNA quality was tested by visualization on agarose gels and " - "by Agilent 2100 Bioanalyser (Agilent Technologies). sequencing | Standard " - "Illumina protocol for cDNA sequencing" - ), - ) - self.assertAnnotation( - sample, - "sample_details.growth_protocol", - ( - "grow | The E. coli K-12 MG1655 bacterial strains used in this " - "work are the following: E. coli MG1655 (F- lambda- ilvG- rfb-50 rph-1). " - "Luria-Bertani (0.5% NaCl) broth and agar (15 g/liter) were used for routine " - "growth." - ), - ) - self.assertAnnotation(sample, "sample_details.library_type", "total_rna")