From d72797477026ba9f0fc623dd4755bbdc8fa51282 Mon Sep 17 00:00:00 2001 From: Remi Denise Date: Tue, 4 Jan 2022 23:33:09 +0100 Subject: [PATCH] :zap: adding improvement from pull request #219 Adding the improvement about the pull request #219 to remove the `-c` in prodigal in metagenome and adding the `--partialgenes` option. Also adding improvment (1) from issue #474 --- bin/prokka | 129 +++++++++++++++++++++++++++++++++++------------------ 1 file changed, 85 insertions(+), 44 deletions(-) diff --git a/bin/prokka b/bin/prokka index d2ad9d2..77d568b 100755 --- a/bin/prokka +++ b/bin/prokka @@ -38,6 +38,7 @@ use Bio::Seq; use Bio::SeqFeature::Generic; use Bio::Tools::GFF; use Bio::Tools::GuessSeqFormat; +use Bio::Location::Fuzzy; # . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . # global variables @@ -233,7 +234,7 @@ my(@Options, $quiet, $debug, $kingdom, $fast, $force, $outdir, $prefix, $cpus, $ $genus, $species, $strain, $plasmid, $prodigaltf, $usegenus, $proteins, $hmms, $centre, $scaffolds, $rfam, $norrna, $notrna, $rnammer, $rawproduct, $noanno, $accver, - $metagenome, $compliant, $listdb, $citation); + $metagenome, $partialgenes, $compliant, $listdb, $citation); $dbdir = $ENV{'PROKKA_DBDIR'} || abs_path("$FindBin::RealBin/../db"); @@ -323,7 +324,7 @@ msg("Annotating as >>> $kingdom <<<"); # . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . # check if --setupdb has been run -if ( ! -r "$dbdir/kingdom/$kingdom/sprot.pin" or ! -r "$dbdir/hmm/HAMAP.hmm.h3i") { +if ( ! -r "$dbdir/kingdom/$kingdom/sprot.pin" or ! "$dbdir/hmm/HAMAP.hmm.h3i") { err("The sequence databases have not been indexed. Please run 'prokka --setupdb' first."); } @@ -709,52 +710,86 @@ if ($tools{'minced'}->{HAVE}) { # . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . # CDS +# Create a hash of RNA lists, grouped by seqid to speed up lookups. +my %rnabyseq; +for my $rna (@allrna) { + my @a = (); + $rnabyseq{$rna->seq_id} = \@a; +} +for my $rna (@allrna) { + my $a = $rnabyseq{$rna->seq_id}; + push @$a, $rna; +} + msg("Predicting coding sequences"); +my $tool = "Prodigal:".$tools{prodigal}->{VERSION}; my $totalbp = sum( map { $seq{$_}{DNA}->length } @seq); my $prodigal_mode = ($totalbp >= 100000 && !$metagenome) ? 'single' : 'meta'; -msg("Contigs total $totalbp bp, so using $prodigal_mode mode"); +my $prodigal_closedends = $partialgenes ? '' : '-c'; +msg("Contigs total $totalbp bp, so using $prodigal_mode mode, " . + ($partialgenes ? 'not' : '') . " allowing genes to run off contig edges."); my $num_cds=0; -my $cmd = "prodigal -i \Q$outdir/$prefix.fna\E -c -m -g $gcode -p $prodigal_mode -f sco -q"; +my $cmd = "prodigal -i \Q$outdir/$prefix.fna\E $prodigal_closedends -m -g $gcode -p $prodigal_mode -f gff -q"; if ($prodigaltf and -r $prodigaltf) { msg("Gene finding will be aided by Prodigal training file: $prodigaltf"); $cmd .= " -t '$prodigaltf'"; } msg("Running: $cmd"); open my $PRODIGAL, '-|', $cmd; -my $sid; -while (<$PRODIGAL>) { - if (m/seqhdr="([^\s\"]+)"/) { - $sid = $1; -# msg("CDS $sid"); - next; +my $gff_in = Bio::Tools::GFF->new(-fh => $PRODIGAL, + -gff_version => 3); +while (my $cds = $gff_in->next_feature){ + my $sid = $cds->seq_id; + # Determine whether the CDS is partial, i.e. it is running off + # contig edges. Add the appropriate /codon_start attribute if necessary + # Assuming prodigal only returns one single 'partial' tag + # Modify start & end attributes, because gff output is wrong + my ($partial) = $cds->get_tag_values('partial'); + my ($codon_start); + my ($start, $end) = ($cds->start, $cds->end); + my $ctg_length = $seq{$sid}{DNA}->length; + if ($partial ne '00') { + if ($partial =~ /^1/){ + $codon_start = $cds->start if ($cds->strand == 1); + $start = '<1'; } - elsif (m/^>\d+_(\d+)_(\d+)_([+-])$/) { - my $tool = "Prodigal:".$tools{prodigal}->{VERSION}; # FIXME: why inner loop? - my $cds = Bio::SeqFeature::Generic->new( - -primary => 'CDS', - -seq_id => $sid, - -source => $tool, - -start => $1, - -end => $2, - -strand => ($3 eq '+' ? +1 : -1), - -score => undef, - -frame => 0, - -tag => { - 'inference' => "ab initio prediction:$tool", - } - ); + if ($partial =~ /1$/){ + $codon_start = ($ctg_length - $cds->end + 1) + if ($cds->strand == -1); + $end = ">$ctg_length"; + } + my $fuzzyloc = Bio::Location::Fuzzy->new( + -start => $start, + -end => $end, + -location_type => '..', + -strand => $cds->strand); + $cds->location($fuzzyloc); + } + # Removing all tags + foreach my $tag ($cds->get_all_tags) { + $cds->remove_tag($tag); + } + # Adding only codon_start, if applicable, and the inference tag + if ($codon_start){ + $cds->add_tag_value('codon_start', $codon_start); + $cds->frame($codon_start-1); + } + $cds->add_tag_value('inference', "ab initio prediction:$tool"); my $overlap; - for my $rna (@allrna) { - # same contig, overlapping (could check same strand too? not sure) - if ($rna->seq_id eq $sid and $cds->overlaps($rna)) { - $overlap = $rna; - last; - } + if (!$cds_rna_olap and exists($rnabyseq{$sid})) { + my $seqrna = $rnabyseq{$sid}; + for my $rna (@$seqrna) { + # same contig, overlapping (could check same strand too? not sure) + if ($rna->seq_id eq $sid and $cds->overlaps($rna)) { + $overlap = $rna; + last; + } + } } # mitochondria are highly packed, so don't exclude as CDS/tRNA often overlap. - if ($overlap and ! $cds_rna_olap) { + if ($overlap) { my $type = $overlap->primary_tag; - msg("Excluding CDS which overlaps existing RNA ($type) at $sid:$1..$2 on $3 strand"); + msg("Excluding CDS which overlaps existing RNA ($type) at $sid:$start..$end on strand " . $cds->strand); } else { $num_cds++; @@ -765,7 +800,6 @@ while (<$PRODIGAL>) { } } - } } msg("Found $num_cds CDS"); @@ -800,7 +834,7 @@ if ($tools{signalp}->{HAVE}) { for my $f (@{ $seq{$sid}{FEATURE} }) { next unless $f->primary_tag eq 'CDS'; $cds{++$count} = $f; - my $seq = $f->seq->translate(-codontable_id=>$gcode, -complete=>1); + my $seq = $f->seq->translate(-codontable_id=>$gcode); $seq->display_id($count); $spout->write_seq($seq); } @@ -1079,7 +1113,7 @@ else { next if $f->has_tag('product'); $cds{++$count} = $f; print $faa ">$count\n", - $f->seq->translate(-codontable_id=>$gcode, -complete=>1)->seq,"\n"; + $f->seq->translate(-codontable_id=>$gcode)->seq,"\n"; } } close $faa; @@ -1232,9 +1266,6 @@ for my $sid (@seq) { my $g = Bio::SeqFeature::Generic->new( -primary => 'gene', -seq_id => $f->seq_id, - -start => $f->start, - -end => $f->end, - -strand => $f->strand, -source_tag => $EXE, -tag => { 'locus_tag' => $ID, @@ -1244,6 +1275,7 @@ for my $sid (@seq) { # Make a Parent tag from the CDS to the gene $f->add_tag_value('Parent', $gene_id); # copy the /gene from the CDS + $g->location($f->location); if (my $gENE = TAG($f, 'gene')) { $g->add_tag_value('gene', $gENE); } @@ -1318,12 +1350,19 @@ for my $sid (@seq) { if (my $name = TAG($f, 'gene')) { $f->add_tag_value('Name', $name); } - # Make sure we have valid frames/phases (GFF column 8) - $f->frame( $f->primary_tag eq 'CDS' ? 0 : '.' ); + + # Make sure we have valid frames/phases (GFF column 8: 0/1/2 for CDS, + # . for other primary tags.) + $f->frame( $f->primary_tag eq 'CDS' ? $f->frame : '.' ); print $gff_fh $f->gff_string($gff_factory),"\n"; - my ($L,$R) = ($f->strand >= 0) ? ($f->start,$f->end) : ($f->end,$f->start); + my ($start, $end) = ($f->start, $f->end); + $start = (($f->strand >= 0) ? '<' : '>') . $start + if ($f->location->start_pos_type eq 'BEFORE'); + $end = (($f->strand >= 0) ? '>' : '<') . $end + if ($f->location->end_pos_type eq 'AFTER'); + my ($L, $R) = ($f->strand >= 0) ? ($start, $end) : ($end, $start); print {$tbl_fh} "$L\t$R\t",$f->primary_tag,"\n"; for my $tag ($f->get_all_tags) { next if $tag =~ m/^[A-Z]/ and $tag !~ m/EC_number/i; # remove GFF specific tags (start with uppercase letter) @@ -1343,7 +1382,8 @@ for my $sid (@seq) { # $ffn_fh->write_seq($p); # } if ($f->primary_tag eq 'CDS') { - $faa_fh->write_seq( $p->translate(-codontable_id=>$gcode, -complete=>1) ); + $faa_fh->write_seq( $p->translate(-codontable_id=>$gcode, + -frame => $f->frame) ); } if ($f->primary_tag =~ m/^(CDS|rRNA|tmRNA|tRNA|misc_RNA)$/) { $ffn_fh->write_seq($p); @@ -1519,7 +1559,7 @@ sub find_exe { sub msg { my $t = localtime; - my $line = "[".$t->hms."] @_\n"; + my $line = "[".$t->dmy." ".$t->hms."] @_\n"; print STDERR $line unless $quiet; if (openhandle(\*LOG)) { # write out any buffered log lines @@ -1767,6 +1807,7 @@ sub setOptions { {OPT=>"proteins=s", VAR=>\$proteins, DEFAULT=>'', DESC=>"FASTA or GBK file to use as 1st priority"}, {OPT=>"hmms=s", VAR=>\$hmms, DEFAULT=>'', DESC=>"Trusted HMM to first annotate from"}, {OPT=>"metagenome!", VAR=>\$metagenome, DEFAULT=>0, DESC=>"Improve gene predictions for highly fragmented genomes"}, + {OPT=>"partialgenes!", VAR=>\$partialgenes, DEFAULT=>0, DESC=>"Allow genes to run off edges, yielding incomplete genes (no closed ends option in prodigal)"}, {OPT=>"rawproduct!", VAR=>\$rawproduct, DEFAULT=>0, DESC=>"Do not clean up /product annotation"}, {OPT=>"cdsrnaolap!", VAR=>\$cds_rna_olap, DEFAULT=>0, DESC=>"Allow [tr]RNA to overlap CDS"}, 'Matching:',