Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Remove RNA-SeQC from workflows and MultiQC #1322

Merged
merged 2 commits into from
Mar 26, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions docs/CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@ Added

Changed
-------
- Remove ``rnaseqc-qc`` 3' bias statistics from MultiQC report
- Remove ``rnaseqc-qc`` from RNA-seq workflows

Fixed
-----
Expand Down
46 changes: 1 addition & 45 deletions resolwe_bio/processes/support_processors/multiqc.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,12 +65,6 @@ def create_summary_table(samples, species, build):
json.dump(sample_summary_json, out_file)


def parse_rnaseqc_report(report):
    """Load an RNA-SeQC QC report into a dictionary.

    The report is read as a tab-separated table with pandas' default header
    handling, so the first line is consumed as column names and does not
    appear in the result. Every remaining row must consist of exactly two
    cells: the first becomes the key and the second the value.

    :param report: Path or file-like object accepted by ``pandas.read_csv``.
    :return: Dictionary mapping the first column to the second column.
    """
    table = pd.read_csv(report, sep="\t")
    # Each row of ``table.values`` is unpacked into a (key, value) pair.
    return {metric: value for metric, value in table.values}


def parse_genebody_report(report):
"""Parse QoRTs gene body coverage metrics report file."""
df = pd.read_csv(report, sep="\t", compression="gzip")
Expand All @@ -79,37 +73,6 @@ def parse_genebody_report(report):
return dict


def create_coverage_table(sample_names, reports):
    """Write the RNA-SeQC 3' bias statistics table as a JSON file.

    Each report is parsed with ``parse_rnaseqc_report`` and reduced to the
    3' bias metrics listed below; metrics missing from a report are simply
    left out for that sample. The result is written to
    ``rnaseqc_coverage_mqc.json`` in the current working directory.

    :param sample_names: Sample names, paired positionally with ``reports``.
    :param reports: RNA-SeQC report files, one per sample name.
    """
    wanted_metrics = (
        "Genes used in 3' bias",
        "Mean 3' bias",
        "Median 3' bias",
        "3' bias Std",
        "3' bias MAD_Std",
        "3' Bias, 25th Percentile",
        "3' Bias, 75th Percentile",
    )

    per_sample = {}
    for name, report_file in zip(sample_names, reports):
        metrics = parse_rnaseqc_report(report_file)
        # Keep only the requested metrics, in the order listed above.
        per_sample[name] = {
            metric: metrics[metric] for metric in wanted_metrics if metric in metrics
        }

    table_document = {
        "id": "coverage_qc",
        "section_name": "RNA-SeQC Coverage Stats",
        "plot_type": "table",
        "file_format": "json",
        "data": per_sample,
    }

    with open("rnaseqc_coverage_mqc.json", "w") as handle:
        json.dump(table_document, handle)


def create_coverage_plot(sample_names, reports):
"""Prepare QoRTs gene body coverage plot."""
genebody_qc_json = {
Expand Down Expand Up @@ -474,7 +437,7 @@ class MultiQC(Process):
}
category = "QC"
data_name = "MultiQC report"
version = "1.23.0"
version = "1.24.0"
marcellevstek marked this conversation as resolved.
Show resolved Hide resolved

class Input:
"""Input fields to process MultiQC."""
Expand Down Expand Up @@ -552,8 +515,6 @@ def run(self, inputs, outputs):
unsupported_data = []
star_quantification_samples = []
star_quantification_reports = []
rnaseqc_samples = []
rnaseqc_reports = []
qorts_samples = []
qorts_reports = []

Expand Down Expand Up @@ -733,14 +694,9 @@ def run(self, inputs, outputs):

elif d.process.type == "data:rnaseqc:qc:":
name = os.path.basename(d.output.metrics.path)
rnaseqc_samples.append(sample_name)
rnaseqc_reports.append(d.output.metrics.path)
create_symlink(
src=d.output.metrics.path, dst=os.path.join(sample_dir, name)
)
create_coverage_table(
sample_names=rnaseqc_samples, reports=rnaseqc_reports
)

elif d.process.type == "data:expression:salmon:":
# Symlink files/dirs without the parent directory to
Expand Down
20 changes: 1 addition & 19 deletions resolwe_bio/processes/workflows/bbduk_salmon_qc.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ class WorkflowBbdukSalmonQc(Process):
entity = {
"type": "sample",
}
version = "4.3.1"
version = "4.4.0"
process_type = "data:workflow:rnaseq:salmon"
category = "Pipeline"

Expand Down Expand Up @@ -449,22 +449,4 @@ def run(self, inputs, outputs):
]
}

# RNA-SeQC tool is initiated only if annotation source is ENSEMBL
if inputs.annotation.output.source == "ENSEMBL":
input_rnaseqc = {
"alignment": alignment_qc,
"annotation": inputs.annotation,
"strand_detection_options": {
"stranded": "auto",
"cdna_index": inputs.salmon_index,
"n_reads": 5000000,
},
}
rnaseqc = Data.create(
process=BioProcess.get_latest(slug="rnaseqc-qc"),
input=input_rnaseqc,
name=f"RNA-SeQC QC report ({inputs.reads.name})",
)
input_multiqc["data"].append(rnaseqc)

Data.create(process=BioProcess.get_latest(slug="multiqc"), input=input_multiqc)
22 changes: 1 addition & 21 deletions resolwe_bio/processes/workflows/bbduk_star.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ class WorkflowSTAR(Process):
"expression-engine": "jinja",
}
data_name = "{{ reads|name|default('?') }}"
version = "1.4.0"
version = "1.5.0"
entity = {
"type": "sample",
}
Expand Down Expand Up @@ -731,24 +731,4 @@ def run(self, inputs, outputs):
"advanced": {"dirs": True, "config": True},
}

# RNA-SeQC tool is initiated only if annotation source is ENSEMBL
if inputs.annotation.output.source == "ENSEMBL":
input_rnaseqc = {
"alignment": alignment_downsampled,
"annotation": inputs.annotation,
"strand_detection_options": {"stranded": inputs.assay_type},
}

if inputs.cdna_index:
input_rnaseqc["strand_detection_options"][
"cdna_index"
] = inputs.cdna_index

rnaseqc = Data.create(
process=BioProcess.get_latest(slug="rnaseqc-qc"),
input=input_rnaseqc,
name=f"RNA-SeQC QC report ({inputs.reads.name})",
)
input_multiqc["data"].append(rnaseqc)

Data.create(process=BioProcess.get_latest(slug="multiqc"), input=input_multiqc)
22 changes: 1 addition & 21 deletions resolwe_bio/processes/workflows/bbduk_star_featurecounts_qc.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ class WorkflowBBDukStarFcQC(Process):
entity = {
"type": "sample",
}
version = "6.2.0"
version = "6.3.0"
process_type = "data:workflow:rnaseq:featurecounts:qc"
category = "Pipeline"

Expand Down Expand Up @@ -764,24 +764,4 @@ def run(self, inputs, outputs):
"advanced": {"dirs": True, "config": True},
}

# RNA-SeQC tool is initiated only if annotation source is ENSEMBL
if inputs.annotation.output.source == "ENSEMBL":
input_rnaseqc = {
"alignment": alignment_downsampled,
"annotation": inputs.annotation,
"strand_detection_options": {"stranded": inputs.assay_type},
}

if inputs.cdna_index:
input_rnaseqc["strand_detection_options"][
"cdna_index"
] = inputs.cdna_index

rnaseqc = Data.create(
process=BioProcess.get_latest(slug="rnaseqc-qc"),
input=input_rnaseqc,
name=f"RNA-SeQC QC report ({inputs.reads.name})",
)
input_multiqc["data"].append(rnaseqc)

Data.create(process=BioProcess.get_latest(slug="multiqc"), input=input_multiqc)
9 changes: 0 additions & 9 deletions resolwe_bio/tests/processes/test_support_processors.py
Original file line number Diff line number Diff line change
Expand Up @@ -491,14 +491,6 @@ def test_multiqc(self):
},
)

rnaseqc_report = self.run_process(
"rnaseqc-qc",
{
"alignment": star_alignment.id,
"annotation": annotation.id,
},
)

# BED file is not part of a sample entity. Test if MultiQC process
# correctly skips this input data object
bed = self.run_process(
Expand All @@ -518,7 +510,6 @@ def test_multiqc(self):
star_quantification.id,
samtools_idxstats.id,
qorts_report.id,
rnaseqc_report.id,
bed.id,
],
"advanced": {
Expand Down