Skip to content

Commit

Permalink
⚡ adding improvement from pull request tseemann#219
Browse files Browse the repository at this point in the history
Adding the improvement about the pull request tseemann#219 to remove the `-c`  in prodigal in metagenome and adding the `--partialgenes` option.
Also adding improvment (1) from issue tseemann#474
  • Loading branch information
rdenise authored Jan 4, 2022
1 parent c335b7a commit d727974
Showing 1 changed file with 85 additions and 44 deletions.
129 changes: 85 additions & 44 deletions bin/prokka
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ use Bio::Seq;
use Bio::SeqFeature::Generic;
use Bio::Tools::GFF;
use Bio::Tools::GuessSeqFormat;
use Bio::Location::Fuzzy;

# . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
# global variables
Expand Down Expand Up @@ -233,7 +234,7 @@ my(@Options, $quiet, $debug, $kingdom, $fast, $force, $outdir, $prefix, $cpus, $
$genus, $species, $strain, $plasmid, $prodigaltf,
$usegenus, $proteins, $hmms, $centre, $scaffolds,
$rfam, $norrna, $notrna, $rnammer, $rawproduct, $noanno, $accver,
$metagenome, $compliant, $listdb, $citation);
$metagenome, $partialgenes, $compliant, $listdb, $citation);

$dbdir = $ENV{'PROKKA_DBDIR'} || abs_path("$FindBin::RealBin/../db");

Expand Down Expand Up @@ -323,7 +324,7 @@ msg("Annotating as >>> $kingdom <<<");
# . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
# check if --setupdb has been run

if ( ! -r "$dbdir/kingdom/$kingdom/sprot.pin" or ! -r "$dbdir/hmm/HAMAP.hmm.h3i") {
if ( ! -r "$dbdir/kingdom/$kingdom/sprot.pin" or ! "$dbdir/hmm/HAMAP.hmm.h3i") {
err("The sequence databases have not been indexed. Please run 'prokka --setupdb' first.");
}

Expand Down Expand Up @@ -709,52 +710,86 @@ if ($tools{'minced'}->{HAVE}) {
# . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
# CDS

# Create a hash of RNA lists, grouped by seqid to speed up lookups.
my %rnabyseq;
for my $rna (@allrna) {
my @a = ();
$rnabyseq{$rna->seq_id} = \@a;
}
for my $rna (@allrna) {
my $a = $rnabyseq{$rna->seq_id};
push @$a, $rna;
}

msg("Predicting coding sequences");
my $tool = "Prodigal:".$tools{prodigal}->{VERSION};
my $totalbp = sum( map { $seq{$_}{DNA}->length } @seq);
my $prodigal_mode = ($totalbp >= 100000 && !$metagenome) ? 'single' : 'meta';
msg("Contigs total $totalbp bp, so using $prodigal_mode mode");
my $prodigal_closedends = $partialgenes ? '' : '-c';
msg("Contigs total $totalbp bp, so using $prodigal_mode mode, " .
($partialgenes ? 'not' : '') . " allowing genes to run off contig edges.");
my $num_cds=0;
my $cmd = "prodigal -i \Q$outdir/$prefix.fna\E -c -m -g $gcode -p $prodigal_mode -f sco -q";
my $cmd = "prodigal -i \Q$outdir/$prefix.fna\E $prodigal_closedends -m -g $gcode -p $prodigal_mode -f gff -q";
if ($prodigaltf and -r $prodigaltf) {
msg("Gene finding will be aided by Prodigal training file: $prodigaltf");
$cmd .= " -t '$prodigaltf'";
}
msg("Running: $cmd");
open my $PRODIGAL, '-|', $cmd;
my $sid;
while (<$PRODIGAL>) {
if (m/seqhdr="([^\s\"]+)"/) {
$sid = $1;
# msg("CDS $sid");
next;
my $gff_in = Bio::Tools::GFF->new(-fh => $PRODIGAL,
-gff_version => 3);
while (my $cds = $gff_in->next_feature){
my $sid = $cds->seq_id;
# Determine whether the CDS is partial, i.e. it is running off
# contig edges. Add the appropriate /codon_start attribute if necessary
# Assuming prodigal only returns one single 'partial' tag
# Modify start & end attributes, because gff output is wrong
my ($partial) = $cds->get_tag_values('partial');
my ($codon_start);
my ($start, $end) = ($cds->start, $cds->end);
my $ctg_length = $seq{$sid}{DNA}->length;
if ($partial ne '00') {
if ($partial =~ /^1/){
$codon_start = $cds->start if ($cds->strand == 1);
$start = '<1';
}
elsif (m/^>\d+_(\d+)_(\d+)_([+-])$/) {
my $tool = "Prodigal:".$tools{prodigal}->{VERSION}; # FIXME: why inner loop?
my $cds = Bio::SeqFeature::Generic->new(
-primary => 'CDS',
-seq_id => $sid,
-source => $tool,
-start => $1,
-end => $2,
-strand => ($3 eq '+' ? +1 : -1),
-score => undef,
-frame => 0,
-tag => {
'inference' => "ab initio prediction:$tool",
}
);
if ($partial =~ /1$/){
$codon_start = ($ctg_length - $cds->end + 1)
if ($cds->strand == -1);
$end = ">$ctg_length";
}
my $fuzzyloc = Bio::Location::Fuzzy->new(
-start => $start,
-end => $end,
-location_type => '..',
-strand => $cds->strand);
$cds->location($fuzzyloc);
}
# Removing all tags
foreach my $tag ($cds->get_all_tags) {
$cds->remove_tag($tag);
}
# Adding only codon_start, if applicable, and the inference tag
if ($codon_start){
$cds->add_tag_value('codon_start', $codon_start);
$cds->frame($codon_start-1);
}
$cds->add_tag_value('inference', "ab initio prediction:$tool");
my $overlap;
for my $rna (@allrna) {
# same contig, overlapping (could check same strand too? not sure)
if ($rna->seq_id eq $sid and $cds->overlaps($rna)) {
$overlap = $rna;
last;
}
if (!$cds_rna_olap and exists($rnabyseq{$sid})) {
my $seqrna = $rnabyseq{$sid};
for my $rna (@$seqrna) {
# same contig, overlapping (could check same strand too? not sure)
if ($rna->seq_id eq $sid and $cds->overlaps($rna)) {
$overlap = $rna;
last;
}
}
}
# mitochondria are highly packed, so don't exclude as CDS/tRNA often overlap.
if ($overlap and ! $cds_rna_olap) {
if ($overlap) {
my $type = $overlap->primary_tag;
msg("Excluding CDS which overlaps existing RNA ($type) at $sid:$1..$2 on $3 strand");
msg("Excluding CDS which overlaps existing RNA ($type) at $sid:$start..$end on strand " . $cds->strand);
}
else {
$num_cds++;
Expand All @@ -765,7 +800,6 @@ while (<$PRODIGAL>) {
}

}
}
}
msg("Found $num_cds CDS");

Expand Down Expand Up @@ -800,7 +834,7 @@ if ($tools{signalp}->{HAVE}) {
for my $f (@{ $seq{$sid}{FEATURE} }) {
next unless $f->primary_tag eq 'CDS';
$cds{++$count} = $f;
my $seq = $f->seq->translate(-codontable_id=>$gcode, -complete=>1);
my $seq = $f->seq->translate(-codontable_id=>$gcode);
$seq->display_id($count);
$spout->write_seq($seq);
}
Expand Down Expand Up @@ -1079,7 +1113,7 @@ else {
next if $f->has_tag('product');
$cds{++$count} = $f;
print $faa ">$count\n",
$f->seq->translate(-codontable_id=>$gcode, -complete=>1)->seq,"\n";
$f->seq->translate(-codontable_id=>$gcode)->seq,"\n";
}
}
close $faa;
Expand Down Expand Up @@ -1232,9 +1266,6 @@ for my $sid (@seq) {
my $g = Bio::SeqFeature::Generic->new(
-primary => 'gene',
-seq_id => $f->seq_id,
-start => $f->start,
-end => $f->end,
-strand => $f->strand,
-source_tag => $EXE,
-tag => {
'locus_tag' => $ID,
Expand All @@ -1244,6 +1275,7 @@ for my $sid (@seq) {
# Make a Parent tag from the CDS to the gene
$f->add_tag_value('Parent', $gene_id);
# copy the /gene from the CDS
$g->location($f->location);
if (my $gENE = TAG($f, 'gene')) {
$g->add_tag_value('gene', $gENE);
}
Expand Down Expand Up @@ -1318,12 +1350,19 @@ for my $sid (@seq) {
if (my $name = TAG($f, 'gene')) {
$f->add_tag_value('Name', $name);
}
# Make sure we have valid frames/phases (GFF column 8)
$f->frame( $f->primary_tag eq 'CDS' ? 0 : '.' );

# Make sure we have valid frames/phases (GFF column 8: 0/1/2 for CDS,
# . for other primary tags.)
$f->frame( $f->primary_tag eq 'CDS' ? $f->frame : '.' );

print $gff_fh $f->gff_string($gff_factory),"\n";

my ($L,$R) = ($f->strand >= 0) ? ($f->start,$f->end) : ($f->end,$f->start);
my ($start, $end) = ($f->start, $f->end);
$start = (($f->strand >= 0) ? '<' : '>') . $start
if ($f->location->start_pos_type eq 'BEFORE');
$end = (($f->strand >= 0) ? '>' : '<') . $end
if ($f->location->end_pos_type eq 'AFTER');
my ($L, $R) = ($f->strand >= 0) ? ($start, $end) : ($end, $start);
print {$tbl_fh} "$L\t$R\t",$f->primary_tag,"\n";
for my $tag ($f->get_all_tags) {
next if $tag =~ m/^[A-Z]/ and $tag !~ m/EC_number/i; # remove GFF specific tags (start with uppercase letter)
Expand All @@ -1343,7 +1382,8 @@ for my $sid (@seq) {
# $ffn_fh->write_seq($p);
# }
if ($f->primary_tag eq 'CDS') {
$faa_fh->write_seq( $p->translate(-codontable_id=>$gcode, -complete=>1) );
$faa_fh->write_seq( $p->translate(-codontable_id=>$gcode,
-frame => $f->frame) );
}
if ($f->primary_tag =~ m/^(CDS|rRNA|tmRNA|tRNA|misc_RNA)$/) {
$ffn_fh->write_seq($p);
Expand Down Expand Up @@ -1519,7 +1559,7 @@ sub find_exe {

sub msg {
my $t = localtime;
my $line = "[".$t->hms."] @_\n";
my $line = "[".$t->dmy." ".$t->hms."] @_\n";
print STDERR $line unless $quiet;
if (openhandle(\*LOG)) {
# write out any buffered log lines
Expand Down Expand Up @@ -1767,6 +1807,7 @@ sub setOptions {
{OPT=>"proteins=s", VAR=>\$proteins, DEFAULT=>'', DESC=>"FASTA or GBK file to use as 1st priority"},
{OPT=>"hmms=s", VAR=>\$hmms, DEFAULT=>'', DESC=>"Trusted HMM to first annotate from"},
{OPT=>"metagenome!", VAR=>\$metagenome, DEFAULT=>0, DESC=>"Improve gene predictions for highly fragmented genomes"},
{OPT=>"partialgenes!", VAR=>\$partialgenes, DEFAULT=>0, DESC=>"Allow genes to run off edges, yielding incomplete genes (no closed ends option in prodigal)"},
{OPT=>"rawproduct!", VAR=>\$rawproduct, DEFAULT=>0, DESC=>"Do not clean up /product annotation"},
{OPT=>"cdsrnaolap!", VAR=>\$cds_rna_olap, DEFAULT=>0, DESC=>"Allow [tr]RNA to overlap CDS"},
'Matching:',
Expand Down

0 comments on commit d727974

Please sign in to comment.