Commit
Merge pull request #435 from nf-core/fix_taxonomy_files
Fix taxonomy merging scripts and output
Darcy220606 authored Jan 15, 2025
2 parents 640992d + eec7738 commit b89ec1b
Showing 7 changed files with 103 additions and 43 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -16,6 +16,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

- [#427](https://github.com/nf-core/funcscan/pull/427) Fixed the AMP reference database issues reported by users, due to non-ASCII characters. (by @darcy220606)
- [#430](https://github.com/nf-core/funcscan/pull/430) Updated `rgi/main` module to fix incorrect variable name. (by @amizeranschi and @jasmezz)
- [#435](https://github.com/nf-core/funcscan/pull/435) Fixed dependency errors within taxonomy merging scripts, updated the code and output for all three workflows. Bumped to version 0.1.1. (by @darcy220606)

### `Dependencies`

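The dependency errors mentioned in the changelog entry above most likely trace back to `pandas.DataFrame.append`, which was removed in pandas 2.0; the script below now collects the per-sample frames in a list and concatenates them once at the end. A minimal sketch of that pattern with hypothetical sample tables (not the pipeline's real data):

```python
import pandas as pd

# Hypothetical per-sample tables standing in for the merged tool/taxonomy subsets.
samples = {
    "sample1": pd.DataFrame({"contig_id": ["c1", "c2"], "hit": ["ampA", "ampB"]}),
    "sample2": pd.DataFrame({"contig_id": ["c3"], "hit": ["ampC"]}),
}

combined_dfs = []
for sample_id, df in samples.items():
    # collect each per-sample frame instead of calling the removed DataFrame.append()
    combined_dfs.append(df.assign(sample_id=sample_id))

# single concatenation at the end, as in the updated merge_taxonomy.py
merged_df = pd.concat(combined_dfs, ignore_index=True)
print(merged_df)
```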
136 changes: 97 additions & 39 deletions bin/merge_taxonomy.py
@@ -3,7 +3,7 @@
# Written by Anan Ibrahim and released under the MIT license.
# See git repository (https://github.com/Darcy220606/AMPcombi) for full license text.
# Date: March 2024
# Version: 0.1.0
# Version: 0.1.1

# Required modules
import sys
@@ -12,7 +12,7 @@
import numpy as np
import argparse

tool_version = "0.1.0"
tool_version = "0.1.1"
#########################################
# TOP LEVEL: AMPCOMBI
#########################################
@@ -66,6 +66,15 @@
# TAXONOMY
#########################################
def reformat_mmseqs_taxonomy(mmseqs_taxonomy):
"""_summary_
Reformats the taxonomy files and joins them in a list to be passed on to the tools functions
Args:
mmseqs_taxonomy (tsv): mmseqs output file per sample
Returns:
data frame: reformatted tables
"""
mmseqs2_df = pd.read_csv(mmseqs_taxonomy, sep='\t', header=None, names=['contig_id', 'taxid', 'rank_label', 'scientific_name', 'lineage', 'mmseqs_lineage_contig'])
# remove the lineage column
mmseqs2_df.drop('lineage', axis=1, inplace=True)
@@ -85,7 +94,19 @@ def reformat_mmseqs_taxonomy(mmseqs_taxonomy):
# FUNCTION: AMPCOMBI
#########################################
def ampcombi_taxa(args):
merged_df = pd.DataFrame()
"""_summary_
Merges AMPcombi tool output with taxonomy information.
Parameters:
----------
args:
Contains arguments for AMPcombi file path (`amp`) and list of taxonomy file paths (`taxa1`).
Outputs:
-------
Creates a file named `ampcombi_complete_summary_taxonomy.tsv` containing the merged results.
"""
combined_dfs = []

# assign input args to variables
ampcombi = args.amp
@@ -100,31 +121,25 @@ def ampcombi_taxa(args):

# filter the tool df
tool_df = pd.read_csv(ampcombi, sep='\t')
# remove the column with contig_id - duplicate #NOTE: will be fixed in AMPcombi v2.0.0
tool_df = tool_df.drop('contig_id', axis=1)
# make sure 1st and 2nd column have the same column labels
tool_df.rename(columns={tool_df.columns[0]: 'sample_id'}, inplace=True)
tool_df.rename(columns={tool_df.columns[1]: 'contig_id'}, inplace=True)
# grab the real contig id in another column copy for merging
tool_df['contig_id_merge'] = tool_df['contig_id'].str.rsplit('_', 1).str[0]

# merge rows from taxa to ampcombi_df based on substring match in sample_id
# grab the unique sample names from the taxonomy table
samples_taxa = taxa_df['sample_id'].unique()
# for every sampleID in taxadf merge the results
for sampleID in samples_taxa:
# subset ampcombi
subset_tool = tool_df.loc[tool_df['sample_id'].str.contains(sampleID)]
subset_tool = tool_df[tool_df['sample_id'].str.contains(sampleID, na=False)]
# subset taxa
subset_taxa = taxa_df.loc[taxa_df['sample_id'].str.contains(sampleID)]
subset_taxa = taxa_df[taxa_df['sample_id'].str.contains(sampleID, na=False)]
# merge
subset_df = pd.merge(subset_tool, subset_taxa, left_on = 'contig_id_merge', right_on='contig_id', how='left')
subset_df = pd.merge(subset_tool, subset_taxa, on='contig_id', how='left')
# cleanup the table
columnsremove = ['contig_id_merge','contig_id_y', 'sample_id_y']
columnsremove = ['sample_id_y']
subset_df.drop(columnsremove, axis=1, inplace=True)
subset_df.rename(columns={'contig_id_x': 'contig_id', 'sample_id_x':'sample_id'},inplace=True)
subset_df.rename(columns={'sample_id_x':'sample_id'},inplace=True)
# append in the combined_df
merged_df = merged_df.append(subset_df, ignore_index=True)
combined_dfs.append(subset_df)
merged_df = pd.concat(combined_dfs, ignore_index=True)

# write to file
merged_df.to_csv('ampcombi_complete_summary_taxonomy.tsv', sep='\t', index=False)
@@ -133,7 +148,20 @@ def ampcombi_taxa(args):
# FUNCTION: COMBGC
#########################################
def combgc_taxa(args):
merged_df = pd.DataFrame()
"""_summary_
Merges comBGC tool output with taxonomy information.
Parameters:
----------
args:
Contains arguments for comBGC file path (`bgc`) and list of taxonomy file paths (`taxa2`).
Outputs:
-------
Creates a file named `combgc_complete_summary_taxonomy.tsv` containing the merged results.
"""
combined_dfs = []

# assign input args to variables
combgc = args.bgc
@@ -152,23 +180,24 @@ def combgc_taxa(args):
tool_df.rename(columns={tool_df.columns[0]: 'sample_id'}, inplace=True)
tool_df.rename(columns={tool_df.columns[1]: 'contig_id'}, inplace=True)

# merge rows from taxa to ampcombi_df based on substring match in sample_id
# merge rows from taxa to combgc_df based on substring match in sample_id
# grab the unique sample names from the taxonomy table
samples_taxa = taxa_df['sample_id'].unique()
# for every sampleID in taxadf merge the results
for sampleID in samples_taxa:
# subset ampcombi
subset_tool = tool_df.loc[tool_df['sample_id'].str.contains(sampleID)]
# subset tool
subset_tool = tool_df[tool_df['sample_id'].str.contains(sampleID, na=False)]
# subset taxa
subset_taxa = taxa_df.loc[taxa_df['sample_id'].str.contains(sampleID)]
subset_taxa = taxa_df[taxa_df['sample_id'].str.contains(sampleID, na=False)]
# merge
subset_df = pd.merge(subset_tool, subset_taxa, left_on = 'contig_id', right_on='contig_id', how='left')
subset_df = pd.merge(subset_tool, subset_taxa, on='contig_id', how='left')
# cleanup the table
columnsremove = ['sample_id_y']
subset_df.drop(columnsremove, axis=1, inplace=True)
subset_df.rename(columns={'sample_id_x':'sample_id'},inplace=True)
# append in the combined_df
merged_df = merged_df.append(subset_df, ignore_index=True)
combined_dfs.append(subset_df)
merged_df = pd.concat(combined_dfs, ignore_index=True)

# write to file
merged_df.to_csv('combgc_complete_summary_taxonomy.tsv', sep='\t', index=False)
@@ -177,7 +206,19 @@ def combgc_taxa(args):
# FUNCTION: HAMRONIZATION
#########################################
def hamronization_taxa(args):
merged_df = pd.DataFrame()
"""_summary_
Merges hAMRonization tool output with taxonomy information.
Parameters:
----------
args:
Contains arguments for hamronization file path (`arg`) and list of taxonomy file paths (`taxa2`).
Outputs:
-------
Creates a file named `hamronization_complete_summary_taxonomy.tsv` containing the merged results.
"""
combined_dfs = []

# assign input args to variables
hamronization = args.arg
@@ -197,29 +238,46 @@ def hamronization_taxa(args):
# reorder the columns
new_order = ['sample_id', 'contig_id'] + [col for col in tool_df.columns if col not in ['sample_id', 'contig_id']]
tool_df = tool_df.reindex(columns=new_order)
# grab the real contig id in another column copy for merging
tool_df['contig_id_merge'] = tool_df['contig_id'].str.rsplit('_', 1).str[0]

# merge rows from taxa to ampcombi_df based on substring match in sample_id
# merge rows from taxa to hamronization_df based on substring match in sample_id
# grab the unique sample names from the taxonomy table
samples_taxa = taxa_df['sample_id'].unique()
# for every sampleID in taxadf merge the results
for sampleID in samples_taxa:
# subset ampcombi
subset_tool = tool_df.loc[tool_df['sample_id'].str.contains(sampleID)]
# subset tool
subset_tool = tool_df[tool_df['sample_id'].str.contains(sampleID, na=False)]
# subset taxa
subset_taxa = taxa_df.loc[taxa_df['sample_id'].str.contains(sampleID)]
# merge
subset_df = pd.merge(subset_tool, subset_taxa, left_on = 'contig_id_merge', right_on='contig_id', how='left')
# cleanup the table
columnsremove = ['contig_id_merge','contig_id_y', 'sample_id_y']
subset_df.drop(columnsremove, axis=1, inplace=True)
subset_df.rename(columns={'contig_id_x': 'contig_id', 'sample_id_x':'sample_id'},inplace=True)
# append in the combined_df
merged_df = merged_df.append(subset_df, ignore_index=True)
subset_taxa = taxa_df[taxa_df['sample_id'].str.contains(sampleID, na=False)]
# ensure strings
subset_tool['contig_id'] = subset_tool['contig_id'].astype(str)
subset_taxa['contig_id'] = subset_taxa['contig_id'].astype(str)
# rename columns to avoid dropping of mutual ones
rename_dict = {col: f"{col}_taxa" for col in subset_taxa.columns if col in subset_tool.columns}
subset_taxa = subset_taxa.rename(columns=rename_dict)

# merge by string
merged_rows = []
# iterate and find all matches
for _, tool_row in subset_tool.iterrows():
tool_contig_id = tool_row['contig_id']
matches = subset_taxa[subset_taxa['contig_id_taxa'].apply(lambda x: str(x) in tool_contig_id)]
# if match, merge row
if not matches.empty:
for _, taxa_row in matches.iterrows():
merged_row = {**tool_row.to_dict(), **taxa_row.to_dict()}
merged_rows.append(merged_row)
else:
# if no match keep row as is
merged_row = {**tool_row.to_dict()}
merged_rows.append(merged_row)

merged_df = pd.DataFrame(merged_rows)
combined_dfs.append(merged_df)

merged_df_final = pd.concat(combined_dfs, ignore_index=True)

# write to file
merged_df.to_csv('hamronization_complete_summary_taxonomy.tsv', sep='\t', index=False)
merged_df_final.to_csv('hamronization_complete_summary_taxonomy.tsv', sep='\t', index=False)

#########################################
# SUBPARSERS: DEFAULT
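For hAMRonization, the contig identifiers in the tool table and the MMseqs2 taxonomy table no longer match exactly, so the updated `hamronization_taxa` matches rows by checking whether a taxonomy `contig_id` occurs as a substring of the tool's `contig_id`, and keeps rows without a taxonomy hit unchanged. A small sketch of that row-wise matching, with made-up identifiers:

```python
import pandas as pd

# Made-up identifiers: the tool contig id carries a gene suffix, the taxonomy one does not.
subset_tool = pd.DataFrame({"contig_id": ["contig_1_gene3", "contig_2_gene1"], "gene_symbol": ["blaTEM", "aadA1"]})
subset_taxa = pd.DataFrame({"contig_id_taxa": ["contig_1"], "mmseqs_lineage_contig": ["d_Bacteria;p_Proteobacteria"]})

merged_rows = []
for _, tool_row in subset_tool.iterrows():
    # keep every taxonomy row whose contig id appears inside the tool contig id
    matches = subset_taxa[subset_taxa["contig_id_taxa"].apply(lambda x: str(x) in tool_row["contig_id"])]
    if not matches.empty:
        for _, taxa_row in matches.iterrows():
            merged_rows.append({**tool_row.to_dict(), **taxa_row.to_dict()})
    else:
        # no taxonomy match: keep the tool row as is
        merged_rows.append(tool_row.to_dict())

merged_df = pd.DataFrame(merged_rows)
print(merged_df)
```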
1 change: 1 addition & 0 deletions conf/modules.config
@@ -534,6 +534,7 @@ process {
}

withName: ARG_TABIX_BGZIP {
ext.prefix = { "hamronization_complete_summary_taxonomy" }
publishDir = [
path: { "${params.outdir}/reports/hamronization_summarize" },
mode: params.publish_dir_mode,
2 changes: 1 addition & 1 deletion docs/output.md
@@ -522,7 +522,7 @@ Note that filtered FASTA is only used for BGC workflow for run-time optimisation
- `hamronization_summarize/` one of the following:
- `hamronization_combined_report.json`: summarised output in .json format
- `hamronization_combined_report.tsv`: summarised output in .tsv format when the taxonomic classification is turned off (pipeline default).
- `hamronization_combined_report.tsv.gz`: summarised output in gzipped format when the taxonomic classification is turned on by `--run_taxa_classification`.
- `hamronization_complete_summary_taxonomy.tsv.gz`: summarised output in gzipped format when the taxonomic classification is turned on by `--run_taxa_classification`.
- `hamronization_combined_report.html`: interactive output in .html format

</details>
2 changes: 1 addition & 1 deletion tests/test_taxonomy_bakta.nf.test
@@ -79,7 +79,7 @@ nextflow_pipeline {
).match("fargene") },

// hAMRonization
{ assert new File("$outputDir/reports/hamronization_summarize/hamronization_combined_report.tsv.gz").exists() },
{ assert new File("$outputDir/reports/hamronization_summarize/hamronization_complete_summary_taxonomy.tsv.gz").exists() },

// antiSMASH
{ assert snapshot (
2 changes: 1 addition & 1 deletion tests/test_taxonomy_prokka.nf.test
@@ -79,7 +79,7 @@ nextflow_pipeline {
).match("fargene") },

// hAMRonization
{ assert new File("$outputDir/reports/hamronization_summarize/hamronization_combined_report.tsv.gz").exists() },
{ assert new File("$outputDir/reports/hamronization_summarize/hamronization_complete_summary_taxonomy.tsv.gz").exists() },

// antiSMASH
{ assert snapshot (
2 changes: 1 addition & 1 deletion tests/test_taxonomy_pyrodigal.nf.test
@@ -79,7 +79,7 @@ nextflow_pipeline {
).match("fargene") },

// hAMRonization
{ assert new File("$outputDir/reports/hamronization_summarize/hamronization_combined_report.tsv.gz").exists() },
{ assert new File("$outputDir/reports/hamronization_summarize/hamronization_complete_summary_taxonomy.tsv.gz").exists() },

// antiSMASH
{ assert snapshot (
