#!/usr/bin/env python3
"""
A submodule for the apscale wrapper to run SINTAX on generated FASTA files.
By Chris Hempel (christopher.hempel@kaust.edu.sa) on 29 Apr 2024
"""
import datetime
import argparse
import warnings
import subprocess
import os
import re
import pandas as pd
# Suppress warnings so they are not printed to the console
warnings.filterwarnings("ignore")
# Function to print the current time followed by a message
def time_print(text):
    print(datetime.datetime.now().strftime("%H:%M:%S"), ": ", text, sep="")
# Function to replace unknown taxonomy in the PR2 database
def replace_pr2_tax(taxon_series):
    return taxon_series.apply(
        lambda taxon: (
            "Unknown in PR2 database"
            if taxon.endswith("X") or taxon.endswith("sp.") or taxon.endswith("_sp")
            else taxon
        )
    )
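# Illustrative examples for replace_pr2_tax (hypothetical labels): "Chlorophyta_X"
# and "Micromonas_sp" would be masked as "Unknown in PR2 database", while
# "Micromonas_pusilla" would be kept as-is.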
# Function to add a suffix to a filename regardless of its extension
def add_suffix(filename, suffix="-no_cutoff"):
    root, extension = os.path.splitext(filename)
    return f"{root}{suffix}{extension}"
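# For example, add_suffix("taxonomy.csv") returns "taxonomy-no_cutoff.csv".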
# Function to get lowest identified taxon and rank per row
def lowest_taxon_and_rank(row):
    exceptions = [
        "Taxonomy unreliable - confidence threshold not met",
        "Unknown in PR2 database",
        "Unknown in BOLD database",
        "Unknown in SILVA database",
        "Unknown in MIDORI2 database",
    ]
    lowest_taxon = "Taxonomy unreliable"
    lowest_rank = "Taxonomy unreliable"
    # Iterate over rank columns in reverse order
    for col in reversed(ranks):
        value = re.sub(r"\(\d\.\d{2}\)", "", row[col])
        if value == "No match in database":
            lowest_taxon = value
            lowest_rank = value
            break
        elif pd.notna(value) and value not in exceptions:
            lowest_taxon = value
            lowest_rank = col
            break  # Break on the first valid entry found
    return pd.Series([lowest_taxon, lowest_rank], index=["lowest_taxon", "lowest_rank"])
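# Illustrative example for lowest_taxon_and_rank (hypothetical row): if the genus
# column holds "Gammarus" but the species column reads "Taxonomy unreliable -
# confidence threshold not met", the function returns lowest_taxon="Gammarus"
# and lowest_rank="genus".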
# Define function to process tax dfs
def post_processing(df):
    # Remove underscores
    df[ranks] = df[ranks].replace(r"_", " ", regex=True)
    # Make a df without confidence values (relevant for df_no_cutoffs)
    df_ranks = df[ranks].replace(r"\(\d\.\d{2}\)", "", regex=True)
    lowest_columns = df_ranks.apply(lowest_taxon_and_rank, axis=1)
    df = pd.concat([df, lowest_columns], axis=1)
    return df
# Function to replace unknown MIDORI2 ranks
def replace_if_match(taxon):
    if bool(re.search(r"(phylum|class|order|family|genus|species)_", taxon)):
        return "Unknown in MIDORI2 database"
    return taxon
def tax_formatting(df, tax_col, ranks):
    # Taxonomy formatting
    if args.database_format == "silva":
        # Annotate entries with no DB match
        df = df.fillna(",".join(["No match in database" for _ in range(len(ranks))]))
        # Split the tax_col column by comma and expand into new columns
        df[ranks] = df[tax_col].str.split(",", expand=True)
        # Only keep desired columns and ranks and fill missing values
        df = df.drop([tax_col], axis=1).fillna(
            "Taxonomy unreliable - confidence threshold not met"
        )
        # Replace taxa containing Not_available
        df[ranks] = df[ranks].replace("Not_available", "Unknown in SILVA database")
    elif args.database_format == "midori2":
        # Annotate entries with no DB match
        df = df.fillna(",".join(["No match in database" for _ in range(len(ranks))]))
        # Split the tax_col column by comma and expand into new columns
        df[ranks] = df[tax_col].str.split(",", expand=True)
        # Fill missing values with threshold note
        df[ranks] = df[ranks].fillna(
            "Taxonomy unreliable - confidence threshold not met"
        )
        # If any taxon starts with "phylum", "class", "order", "family", "genus", or
        # "species" followed by "_", replace it with "Unknown in MIDORI2 database"
        for rank in ranks:
            df[rank] = df[rank].apply(replace_if_match)
        # Only keep desired columns and ranks
        df = df.drop([tax_col], axis=1)
    elif args.database_format == "bold":
        # Annotate entries with no DB match
        df = df.fillna(",".join(["No match in database" for _ in range(len(ranks))]))
        # Split the tax_col column by comma and expand into new columns
        df[ranks] = (
            df[tax_col]
            .str.split(",", expand=True)
            .fillna("Taxonomy unreliable - confidence threshold not met")
        )
        # Replace species names containing "_sp._" or "_cf._" or ending with "_sp"/"_sp."
        mask = (
            df["species"].str.endswith("_sp")
            | df["species"].str.contains(r"_sp\._", regex=True)
            | df["species"].str.contains(r"_cf\._", regex=True)
            | df["species"].str.endswith("_sp.")
        )
        df.loc[mask, "species"] = "Unknown in BOLD database"
        # Replace taxa containing incertae_sedis
        df[ranks] = df[ranks].applymap(
            lambda x: ("Unknown in BOLD database" if "incertae_sedis" in x else x)
        )
        # Replace 'Unknown_in_BOLD_database' with 'Unknown in BOLD database' in the entire DataFrame
        df = df.replace("Unknown_in_BOLD_database", "Unknown in BOLD database")
        # Only keep desired columns and ranks
        df = df.drop([tax_col], axis=1)
    # TO BE ADDED AT SOME POINT
    # elif args.database_format == "ncbi_nt":
    #     # Drop rows containing "Unknown" = taxid could not be translated
    #     df = df[~df[ranks].apply(lambda row: row.str.contains("Unknown")).any(axis=1)]
    elif args.database_format == "pr2":
        # Annotate entries with no DB match
        df = df.fillna(",".join(["No match in database" for _ in range(len(ranks))]))
        # Split the taxonomy column by comma and expand into new columns
        df[ranks] = (
            df[tax_col]
            .str.replace(":nucl", "")
            .str.replace(":plas", "")
            .str.replace(":apic", "")
            .str.replace(":chrom", "")
            .str.replace(":mito", "")
            .str.split(",", expand=True)
        )
        # Only keep desired columns and ranks and fill missing values with threshold note
        df = df.drop(tax_col, axis=1).fillna(
            "Taxonomy unreliable - confidence threshold not met"
        )
        # Replace unknown taxonomy in the PR2 database
        df[ranks] = df[ranks].apply(replace_pr2_tax)
    return df
# Define arguments
parser = argparse.ArgumentParser(
    description="A submodule for the apscale wrapper to run SINTAX on generated FASTA files."
)
parser.add_argument(
    "--fastafile",
    help="Input FASTA file.",
    required=True,
    metavar="file.fasta",
)
parser.add_argument(
    "--sintax_database",
    help="SINTAX database.",
    metavar="/PATH/TO/DATABASE",
    required=True,
)
parser.add_argument(
    "--database_format",
    help="Format of the database. Note: the SILVA and BOLD databases have to have a specific format.",
    choices=["midori2", "pr2", "silva", "bold"],
    required=True,
)
parser.add_argument(
    "--sintax_confidence_cutoff",
    metavar="N.N",
    default="0.75",
    help="Confidence value cutoff level, ranging from 0-1 (default=0.75).",
)
parser.add_argument(
    "--outfile",
    required=True,
    metavar="OUTFILENAME.csv",
    help="Name of output file in .csv format.",
)
# Set arguments
args = parser.parse_args()
ranks = ["domain", "phylum", "class", "order", "family", "genus", "species"]
# Start of pipeline
time_print(
    f"Running SINTAX on {args.fastafile} with database {args.sintax_database}..."
)
# Run SINTAX on FASTA file
sintaxout = os.path.join(
    os.path.dirname(args.fastafile), "apscale_wrapper_sintax_output.tsv"
)
subprocess.run(
    [
        "vsearch",
        "--sintax",
        args.fastafile,
        "--db",
        args.sintax_database,
        "--tabbedout",
        sintaxout,
        "--sintax_cutoff",
        args.sintax_confidence_cutoff,
        "--randseed",  # Required for reproducibility
        "1",
        "--threads",  # Required for reproducibility
        "1",
    ]
)
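# The subprocess call above corresponds to a shell command of roughly this shape
# (illustrative placeholders for the file names):
#   vsearch --sintax otus.fasta --db sintax_db.fasta \
#       --tabbedout apscale_wrapper_sintax_output.tsv \
#       --sintax_cutoff 0.75 --randseed 1 --threads 1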
# Load in SINTAX output
time_print("Formatting SINTAX output...")
df = pd.read_table(
    sintaxout,
    header=None,
    delim_whitespace=True,  # Ensure consistent parsing
    names=["ID", "tax_with_scores", "spacer", "tax"],
    usecols=["ID", "tax_with_scores", "tax"],
)
# Revert SINTAX-specific format
df["tax"] = (
    df["tax"]
    .str.replace(r"(d|k|p|c|o|f|g|s):", "", regex=True)
    .str.replace(r"_\d+", "", regex=True)
)
df["tax_with_scores"] = (
    df["tax_with_scores"]
    .str.replace(r"(d|k|p|c|o|f|g|s):", "", regex=True)
    .str.replace(r"_\d+", "", regex=True)
)
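# For orientation, a raw SINTAX "tax" field looks roughly like
# "d:Eukaryota,p:Arthropoda,c:Malacostraca,...,s:Gammarus_pulex_1" (made-up entry);
# after the two replacements above it becomes
# "Eukaryota,Arthropoda,Malacostraca,...,Gammarus_pulex".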
# Sort by ID number
df["OTU_ESV_number"] = df["ID"].str.extract(r"_(\d+)", expand=False).astype(int)
df = df.sort_values(by="OTU_ESV_number")
df = df.drop(columns=["OTU_ESV_number"])
# Separate into a no-cutoff df (with confidence scores) and a cutoff-applied df
df_no_cutoffs = df[["ID", "tax_with_scores"]]
df_with_cutoffs = df[["ID", "tax"]]
# Format taxonomy
df_no_cutoffs = tax_formatting(df_no_cutoffs, "tax_with_scores", ranks)
df_with_cutoffs = tax_formatting(df_with_cutoffs, "tax", ranks)
# Process dfs
df_no_cutoffs = post_processing(df_no_cutoffs)
df_with_cutoffs = post_processing(df_with_cutoffs)
# Save
df_no_cutoffs.to_csv(add_suffix(args.outfile), index=False)
df_with_cutoffs.to_csv(args.outfile, index=False)
time_print("SINTAX formatting done.")