Merge pull request #81 from martinghunt/fix_assemble_not_only_assembler

Fix assemble not only assembler
sanger-pathogens · Jan 24, 2017 · 1fbf2a8 · 1fbf2a8
2 parents c8e2049 + 7b50684
commit 1fbf2a8
Show file tree

Hide file tree

Showing 11 changed files with 111 additions and 26 deletions.
diff --git a/circlator/bamfilter.py b/circlator/bamfilter.py
@@ -11,6 +11,7 @@ def __init__(
              self,
              bam,
              outprefix,
+             fastq_out=False,
              length_cutoff=100000,
              min_read_length=250,
              contigs_to_use=None,
@@ -22,8 +23,9 @@ def __init__(
         if not os.path.exists(self.bam):
             raise Error('File not found:' + self.bam)
 
+        self.fastq_out = fastq_out
         self.length_cutoff = length_cutoff
-        self.reads_fa = os.path.abspath(outprefix + '.fasta')
+        self.reads_outfile = os.path.abspath(outprefix + ('.fastq' if self.fastq_out else '.fasta'))
         self.log = os.path.abspath(outprefix + '.log')
         self.log_prefix = log_prefix
         self.contigs_to_use = self._get_contigs_to_use(contigs_to_use)
@@ -69,15 +71,15 @@ def _all_reads_from_contig(self, contig, fout):
         '''Gets all reads from contig called "contig" and writes to fout'''
         sam_reader = pysam.Samfile(self.bam, "rb")
         for read in sam_reader.fetch(contig):
-            print(mapping.aligned_read_to_read(read, ignore_quality=True), file=fout)
+            print(mapping.aligned_read_to_read(read, ignore_quality=not self.fastq_out), file=fout)
 
 
     def _get_all_unmapped_reads(self, fout):
         '''Writes all unmapped reads to fout'''
         sam_reader = pysam.Samfile(self.bam, "rb")
         for read in sam_reader.fetch(until_eof=True):
             if read.is_unmapped:
-                print(mapping.aligned_read_to_read(read, ignore_quality=True), file=fout)
+                print(mapping.aligned_read_to_read(read, ignore_quality=not self.fastq_out), file=fout)
 
 
     def _break_reads(self, contig, position, fout, min_read_length=250):
@@ -88,15 +90,15 @@ def _break_reads(self, contig, position, fout, min_read_length=250):
             if read.pos < position < read.reference_end - 1:
                 split_point = position - read.pos
                 if split_point - 1 >= min_read_length:
-                    sequence = mapping.aligned_read_to_read(read, revcomp=False, ignore_quality=True).subseq(0, split_point)
+                    sequence = mapping.aligned_read_to_read(read, revcomp=False, ignore_quality=not self.fastq_out).subseq(0, split_point)
                     sequence.id += '.left'
                     seqs.append(sequence)
                 if read.query_length - split_point >= min_read_length:
-                    sequence = mapping.aligned_read_to_read(read, revcomp=False, ignore_quality=True).subseq(split_point, read.query_length)
+                    sequence = mapping.aligned_read_to_read(read, revcomp=False, ignore_quality=not self.fastq_out).subseq(split_point, read.query_length)
                     sequence.id += '.right'
                     seqs.append(sequence)
             else:
-                seqs.append(mapping.aligned_read_to_read(read, revcomp=False, ignore_quality=True))
+                seqs.append(mapping.aligned_read_to_read(read, revcomp=False, ignore_quality=not self.fastq_out))
 
             for seq in seqs:
                 if read.is_reverse:
@@ -111,7 +113,7 @@ def _exclude_region(self, contig, start, end, fout):
         for read in sam_reader.fetch(contig):
             read_interval = pyfastaq.intervals.Interval(read.pos, read.reference_end - 1)
             if not read_interval.intersects(exclude_interval):
-                print(mapping.aligned_read_to_read(read, ignore_quality=True), file=fout)
+                print(mapping.aligned_read_to_read(read, ignore_quality=not self.fastq_out), file=fout)
 
 
     def _get_region(self, contig, start, end, fout, min_length=250):
@@ -120,15 +122,17 @@ def _get_region(self, contig, start, end, fout, min_length=250):
         trimming_end = (start == 0)
         for read in sam_reader.fetch(contig, start, end):
             read_interval = pyfastaq.intervals.Interval(read.pos, read.reference_end - 1)
-            seq = mapping.aligned_read_to_read(read, ignore_quality=True, revcomp=False)
+            seq = mapping.aligned_read_to_read(read, ignore_quality=not self.fastq_out, revcomp=False)
 
             if trimming_end:
                 bases_off_start = 0
                 bases_off_end = max(0, read.reference_end - 1 - end)
-                seq.seq = seq.seq[:read.query_alignment_end - bases_off_end]
+                #seq.seq = seq.seq[:read.query_alignment_end - bases_off_end]
+                seq = seq.subseq(0, read.query_alignment_end - bases_off_end)
             else:
                 bases_off_start = max(0, start - read.pos + 1)
-                seq.seq = seq.seq[bases_off_start  + read.query_alignment_start:]
+                #seq.seq = seq.seq[bases_off_start  + read.query_alignment_start:]
+                seq = seq.subseq(bases_off_start  + read.query_alignment_start, len(seq))
 
             if read.is_reverse:
                 seq.revcomp()
@@ -141,7 +145,7 @@ def run(self):
         ref_lengths = self._get_ref_lengths()
         assert len(ref_lengths) > 0
         f_log = pyfastaq.utils.open_file_write(self.log)
-        f_fa = pyfastaq.utils.open_file_write(self.reads_fa)
+        f_fa = pyfastaq.utils.open_file_write(self.reads_outfile)
         print(self.log_prefix, '#contig', 'length', 'reads_kept', sep='\t', file=f_log)
         if self.verbose:
             print('Getting reads from BAM file', self.bam, flush=True)
@@ -187,4 +191,4 @@ def run(self):
         if self.verbose:
             print('Finished getting reads.')
             print('Log file:', self.log)
-            print('Reads file:', self.reads_fa, flush=True)
+            print('Reads file:', self.reads_outfile, flush=True)
diff --git a/circlator/merge.py b/circlator/merge.py
@@ -694,12 +694,12 @@ def _iterative_bridged_contig_pair_merge(self, outprefix):
                 )
 
                 reads_prefix = outprefix + '.iter.' + str(iteration) + '.reads'
-                reads_to_map =  reads_prefix + '.fasta'
-                bam_filter = circlator.bamfilter.BamFilter(bam, reads_prefix)
+                reads_to_map =  reads_prefix + ('.fasta' if self.spades_only_assembler else '.fastq')
+                bam_filter = circlator.bamfilter.BamFilter(bam, reads_prefix, fastq_out=not self.spades_only_assembler)
                 bam_filter.run()
                 assembler_dir = outprefix + '.iter.' + str(iteration) + '.assembly'
                 a = circlator.assemble.Assembler(
-                    reads_prefix + '.fasta',
+                    reads_prefix + ('.fasta' if self.spades_only_assembler else '.fastq'),
                     assembler_dir,
                     threads=self.threads,
                     careful=self.spades_careful,

diff --git a/circlator/tasks/all.py b/circlator/tasks/all.py
@@ -15,12 +15,12 @@ def print_message(m, opts):
 def run():
     parser = argparse.ArgumentParser(
         description = 'Run mapreads, bam2reads, assemble, merge, clean, fixstart',
-        usage = 'circlator all [options] <assembly.fasta> <reads.fasta> <output directory>')
+        usage = 'circlator all [options] <assembly.fasta> <reads.fasta/q> <output directory>')
     parser.add_argument('--threads', type=int, help='Number of threads [%(default)s]', default=1, metavar='INT')
     parser.add_argument('--verbose', action='store_true', help='Be verbose')
     parser.add_argument('--unchanged_code', type=int, help='Code to return when the input assembly is not changed [%(default)s]', default=0, metavar='INT')
     parser.add_argument('assembly', help='Name of original assembly', metavar='assembly.fasta')
-    parser.add_argument('reads', help='Name of corrected reads FASTA file', metavar='reads.fasta')
+    parser.add_argument('reads', help='Name of corrected reads FASTA or FASTQ file', metavar='reads.fasta/q')
     parser.add_argument('outdir', help='Name of output directory (must not already exist)', metavar='output directory')
 
     mapreads_group = parser.add_argument_group('mapreads options')
@@ -36,7 +36,7 @@ def run():
     parser.add_argument('--assemble_spades_k', help='Comma separated list of kmers to use when running SPAdes. Max kmer is 127 and each kmer should be an odd integer [%(default)s]', default='127,117,107,97,87,77', metavar='k1,k2,k3,...')
     parser.add_argument('--assemble_spades_use_first', action='store_true', help='Use the first successful SPAdes assembly. Default is to try all kmers and use the assembly with the largest N50')
     parser.add_argument('--assemble_not_careful', action='store_true', help='Do not use the --careful option with SPAdes (used by default)')
-    parser.add_argument('--assemble_not_only_assembler', action='store_true', help='Do not use the --assemble-only option with SPAdes (used by default)')
+    parser.add_argument('--assemble_not_only_assembler', action='store_true', help='Do not use the --assemble-only option with SPAdes (used by default). Important: with this option, the input reads must be in FASTQ format, otherwise SPAdes will crash because it needs quality scores to correct the reads.')
 
     merge_group = parser.add_argument_group('merge options')
     merge_group.add_argument('--merge_diagdiff', type=int, help='Nucmer diagdiff option [%(default)s]', metavar='INT', default=25)
@@ -103,7 +103,7 @@ def run():
     original_assembly_renamed = '00.input_assembly.fasta'
     bam = '01.mapreads.bam'
     filtered_reads_prefix = '02.bam2reads'
-    filtered_reads =  filtered_reads_prefix + '.fasta'
+    filtered_reads =  filtered_reads_prefix + ('.fastq' if options.assemble_not_only_assembler else '.fasta')
     assembly_dir = '03.assemble'
     reassembly = os.path.join(assembly_dir, 'contigs.fasta')
     merge_prefix = '04.merge'
@@ -137,6 +137,7 @@ def run():
     bam_filter = circlator.bamfilter.BamFilter(
         bam,
         filtered_reads_prefix,
+        fastq_out=options.assemble_not_only_assembler,
         length_cutoff=options.b2r_length_cutoff,
         min_read_length=options.b2r_min_read_length,
         contigs_to_use=options.b2r_only_contigs,

diff --git a/circlator/tasks/bam2reads.py b/circlator/tasks/bam2reads.py
@@ -8,6 +8,7 @@ def run():
         description = 'Make reads from mapping to be reassembled',
         usage = 'circlator bam2reads [options] <in.bam> <outprefix>')
     parser.add_argument('--discard_unmapped', action='store_true', help='Use this to not keep unmapped reads')
+    parser.add_argument('--fastq', action='store_true', help='Write fastq output (if quality scores are present in input BAM file)')
     parser.add_argument('--only_contigs', help='File of contig names (one per line). Only reads that map to these contigs are kept (and unmapped reads, unless --discard_unmapped is used).', metavar='FILENAME')
     parser.add_argument('--length_cutoff', type=int, help='All reads mapped to contigs shorter than this will be kept [%(default)s]', default=100000, metavar='INT')
     parser.add_argument('--min_read_length', type=int, help='Minimum length of read to output [%(default)s]', default=250, metavar='INT')
@@ -19,6 +20,7 @@ def run():
     bam_filter = circlator.bamfilter.BamFilter(
         options.bam,
         options.outprefix,
+        fastq_out=options.fastq,
         length_cutoff=options.length_cutoff,
         min_read_length=options.min_read_length,
         contigs_to_use=options.only_contigs,

diff --git a/circlator/tests/bamfilter_test.py b/circlator/tests/bamfilter_test.py
@@ -119,11 +119,11 @@ def test_get_region_end(self):
         os.unlink(tmp)
 
 
-    def test_run_keep_unmapped(self):
-        '''test run keep unmapped'''
+    def test_run_keep_unmapped_no_quals(self):
+        '''test run keep unmapped bam has no quality scores'''
         outprefix = 'tmp.bamfilter_run'
         b = bamfilter.BamFilter(
-            os.path.join(data_dir, 'bamfilter_test_run.bam'),
+            os.path.join(data_dir, 'bamfilter_test_run_no_qual.bam'),
             outprefix,
             length_cutoff=600,
             min_read_length=100,
@@ -135,12 +135,58 @@ def test_run_keep_unmapped(self):
         os.unlink(outprefix + '.fasta')
         os.unlink(outprefix + '.log')
 
+        b = bamfilter.BamFilter(
+            os.path.join(data_dir, 'bamfilter_test_run_no_qual.bam'),
+            outprefix,
+            fastq_out=True,
+            length_cutoff=600,
+            min_read_length=100,
+            contigs_to_use={'contig1', 'contig3', 'contig4'}
+        )
+        b.run()
+        expected = os.path.join(data_dir, 'bamfilter_test_run_keep_unmapped.out.reads.fa')
+        self.assertTrue(filecmp.cmp(expected, outprefix + '.fastq', shallow=False))
+        os.unlink(outprefix + '.fastq')
+        os.unlink(outprefix + '.log')
+
+
+    def test_run_keep_unmapped_with_quals(self):
+        '''test run keep unmapped bam has quality scores'''
+        outprefix = 'tmp.bamfilter_run'
+        b = bamfilter.BamFilter(
+            os.path.join(data_dir, 'bamfilter_test_run_with_qual.bam'),
+            outprefix,
+            fastq_out=False,
+            length_cutoff=600,
+            min_read_length=100,
+            contigs_to_use={'contig1', 'contig3', 'contig4'}
+        )
+        b.run()
+        expected = os.path.join(data_dir, 'bamfilter_test_run_keep_unmapped.out.reads.fa')
+        self.assertTrue(filecmp.cmp(expected, outprefix + '.fasta', shallow=False))
+        os.unlink(outprefix + '.fasta')
+        os.unlink(outprefix + '.log')
+
+        b = bamfilter.BamFilter(
+            os.path.join(data_dir, 'bamfilter_test_run_with_qual.bam'),
+            outprefix,
+            fastq_out=True,
+            length_cutoff=600,
+            min_read_length=100,
+            contigs_to_use={'contig1', 'contig3', 'contig4'}
+        )
+        b.run()
+        expected = os.path.join(data_dir, 'bamfilter_test_run_keep_unmapped.out.reads.fq')
+        self.assertTrue(filecmp.cmp(expected, outprefix + '.fastq', shallow=False))
+        os.unlink(outprefix + '.fastq')
+        os.unlink(outprefix + '.log')
+
 
-    def test_run_discard_unmapped(self):
-        '''test run keep unmapped'''
+    def test_run_discard_unmapped_no_quals(self):
+        '''test run keep unmapped bam has no quality scores'''
         outprefix = 'tmp.bamfilter_run'
         b = bamfilter.BamFilter(
-            os.path.join(data_dir, 'bamfilter_test_run.bam'),
+            os.path.join(data_dir, 'bamfilter_test_run_no_qual.bam'),
             outprefix,
             length_cutoff=600,
             min_read_length=100,

diff --git a/circlator/tests/data/bamfilter_test_run_keep_unmapped.out.reads.fq b/circlator/tests/data/bamfilter_test_run_keep_unmapped.out.reads.fq
@@ -0,0 +1,32 @@
+@contig1:1-100
+TTTAATTTGTCCGTAAATTGGGAGGTCTTCAACCGGGGGCGAATGTCGATCTCGTCGAGGCGTTTGTAAAGTGGTAACAGGGGTCATTGATCACGGTGTA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIHGFDC
+@contig1:301-400
+CGTTGATGATACGAATTACGTAGGGCTCTGGGAGATGCTCGGAACCCCACAGCGTCTATTTTAGTTGCGACATTACGCGGTATGCGCTTCTGCAAGATGG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIHGFDC
+@contig4:1-100
+TCGAAAGTACACTTTGAACTCTAAAAGCGGTTACGACCTCTTCCGTTCGATCGATGCGTGAGTACGTACTCTGGATCCAGCCGTGGCAAACCGGGTAACA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIHGFDC
+@contig4:201-350
+CCTCCGCCTGCCTTTGACACACCGGACCTCGGGGGTGTCTAAAAGCCGTCCGTGTGTTGGATAGACATTTGTGACCGTATAGCGGGATGACGTTTCTCTG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@contig4:651-801
+TGAAGGAGGGGGAGTCGTGCCCGTATCTGGGCCCAGTATATACATTGGGCAGGAGGGTTTGTCAAGAATTCTATCCTTACTAGTCTATTTTCGATACGCG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIGFDBAA
+@contig4:851-950
+GAGTACGAGGGACAGATGTCTACACTTGAGCGTACACAAGAATGTGGTACCAAAGGTATCCTCATCGCAACTGGCATTCAAGCCGCTGTTCGACAGTGGG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIHGFDC
+@contig4:901-1000
+CAAAGGTATCCTCATCGCAACTGGCATTCAAGCCGCTGTTCGACAGTGGGTCTGTTGTACCCCTCTGCCCAACTGCTGAGTAGTTGGGTAAGGACCGAGT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIHGFDC
+@unmapped_read
+TTATGGTACTTCGTTGCTCCCAAGGCTGAACTGATACATAGAGTGGGCTTTGTGATAGAACCAAACGACAACGAAGCGAATTTCGTCACCATCTCCATAA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIHGFDC
diff --git a/circlator/tests/data/bamfilter_test_run.bam → ...tests/data/bamfilter_test_run_no_qual.bam b/circlator/tests/data/bamfilter_test_run.bam → ...tests/data/bamfilter_test_run_no_qual.bam
diff --git a/...tor/tests/data/bamfilter_test_run.bam.bai → ...s/data/bamfilter_test_run_no_qual.bam.bai b/...tor/tests/data/bamfilter_test_run.bam.bai → ...s/data/bamfilter_test_run_no_qual.bam.bai
diff --git a/circlator/tests/data/bamfilter_test_run_with_qual.bam b/circlator/tests/data/bamfilter_test_run_with_qual.bam
diff --git a/circlator/tests/data/bamfilter_test_run_with_qual.bam.bai b/circlator/tests/data/bamfilter_test_run_with_qual.bam.bai
diff --git a/setup.py b/setup.py
@@ -7,7 +7,7 @@
 
 setup(
     name='circlator',
-    version='1.4.0',
+    version='1.4.1',
     description='circlator: a tool to circularise genome assemblies',
     packages = find_packages(),
     package_data={'circlator': ['data/*']},