Skip to content

Commit

Permalink
Merge pull request #81 from martinghunt/fix_assemble_not_only_assembler
Browse files Browse the repository at this point in the history
Fix assemble not only assembler
  • Loading branch information
martinghunt authored Jan 24, 2017
2 parents c8e2049 + 7b50684 commit 1fbf2a8
Show file tree
Hide file tree
Showing 11 changed files with 111 additions and 26 deletions.
28 changes: 16 additions & 12 deletions circlator/bamfilter.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ def __init__(
self,
bam,
outprefix,
fastq_out=False,
length_cutoff=100000,
min_read_length=250,
contigs_to_use=None,
Expand All @@ -22,8 +23,9 @@ def __init__(
if not os.path.exists(self.bam):
raise Error('File not found:' + self.bam)

self.fastq_out = fastq_out
self.length_cutoff = length_cutoff
self.reads_fa = os.path.abspath(outprefix + '.fasta')
self.reads_outfile = os.path.abspath(outprefix + ('.fastq' if self.fastq_out else '.fasta'))
self.log = os.path.abspath(outprefix + '.log')
self.log_prefix = log_prefix
self.contigs_to_use = self._get_contigs_to_use(contigs_to_use)
Expand Down Expand Up @@ -69,15 +71,15 @@ def _all_reads_from_contig(self, contig, fout):
'''Gets all reads from contig called "contig" and writes to fout'''
sam_reader = pysam.Samfile(self.bam, "rb")
for read in sam_reader.fetch(contig):
print(mapping.aligned_read_to_read(read, ignore_quality=True), file=fout)
print(mapping.aligned_read_to_read(read, ignore_quality=not self.fastq_out), file=fout)


def _get_all_unmapped_reads(self, fout):
'''Writes all unmapped reads to fout'''
sam_reader = pysam.Samfile(self.bam, "rb")
for read in sam_reader.fetch(until_eof=True):
if read.is_unmapped:
print(mapping.aligned_read_to_read(read, ignore_quality=True), file=fout)
print(mapping.aligned_read_to_read(read, ignore_quality=not self.fastq_out), file=fout)


def _break_reads(self, contig, position, fout, min_read_length=250):
Expand All @@ -88,15 +90,15 @@ def _break_reads(self, contig, position, fout, min_read_length=250):
if read.pos < position < read.reference_end - 1:
split_point = position - read.pos
if split_point - 1 >= min_read_length:
sequence = mapping.aligned_read_to_read(read, revcomp=False, ignore_quality=True).subseq(0, split_point)
sequence = mapping.aligned_read_to_read(read, revcomp=False, ignore_quality=not self.fastq_out).subseq(0, split_point)
sequence.id += '.left'
seqs.append(sequence)
if read.query_length - split_point >= min_read_length:
sequence = mapping.aligned_read_to_read(read, revcomp=False, ignore_quality=True).subseq(split_point, read.query_length)
sequence = mapping.aligned_read_to_read(read, revcomp=False, ignore_quality=not self.fastq_out).subseq(split_point, read.query_length)
sequence.id += '.right'
seqs.append(sequence)
else:
seqs.append(mapping.aligned_read_to_read(read, revcomp=False, ignore_quality=True))
seqs.append(mapping.aligned_read_to_read(read, revcomp=False, ignore_quality=not self.fastq_out))

for seq in seqs:
if read.is_reverse:
Expand All @@ -111,7 +113,7 @@ def _exclude_region(self, contig, start, end, fout):
for read in sam_reader.fetch(contig):
read_interval = pyfastaq.intervals.Interval(read.pos, read.reference_end - 1)
if not read_interval.intersects(exclude_interval):
print(mapping.aligned_read_to_read(read, ignore_quality=True), file=fout)
print(mapping.aligned_read_to_read(read, ignore_quality=not self.fastq_out), file=fout)


def _get_region(self, contig, start, end, fout, min_length=250):
Expand All @@ -120,15 +122,17 @@ def _get_region(self, contig, start, end, fout, min_length=250):
trimming_end = (start == 0)
for read in sam_reader.fetch(contig, start, end):
read_interval = pyfastaq.intervals.Interval(read.pos, read.reference_end - 1)
seq = mapping.aligned_read_to_read(read, ignore_quality=True, revcomp=False)
seq = mapping.aligned_read_to_read(read, ignore_quality=not self.fastq_out, revcomp=False)

if trimming_end:
bases_off_start = 0
bases_off_end = max(0, read.reference_end - 1 - end)
seq.seq = seq.seq[:read.query_alignment_end - bases_off_end]
#seq.seq = seq.seq[:read.query_alignment_end - bases_off_end]
seq = seq.subseq(0, read.query_alignment_end - bases_off_end)
else:
bases_off_start = max(0, start - read.pos + 1)
seq.seq = seq.seq[bases_off_start + read.query_alignment_start:]
#seq.seq = seq.seq[bases_off_start + read.query_alignment_start:]
seq = seq.subseq(bases_off_start + read.query_alignment_start, len(seq))

if read.is_reverse:
seq.revcomp()
Expand All @@ -141,7 +145,7 @@ def run(self):
ref_lengths = self._get_ref_lengths()
assert len(ref_lengths) > 0
f_log = pyfastaq.utils.open_file_write(self.log)
f_fa = pyfastaq.utils.open_file_write(self.reads_fa)
f_fa = pyfastaq.utils.open_file_write(self.reads_outfile)
print(self.log_prefix, '#contig', 'length', 'reads_kept', sep='\t', file=f_log)
if self.verbose:
print('Getting reads from BAM file', self.bam, flush=True)
Expand Down Expand Up @@ -187,4 +191,4 @@ def run(self):
if self.verbose:
print('Finished getting reads.')
print('Log file:', self.log)
print('Reads file:', self.reads_fa, flush=True)
print('Reads file:', self.reads_outfile, flush=True)
6 changes: 3 additions & 3 deletions circlator/merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -694,12 +694,12 @@ def _iterative_bridged_contig_pair_merge(self, outprefix):
)

reads_prefix = outprefix + '.iter.' + str(iteration) + '.reads'
reads_to_map = reads_prefix + '.fasta'
bam_filter = circlator.bamfilter.BamFilter(bam, reads_prefix)
reads_to_map = reads_prefix + ('.fasta' if self.spades_only_assembler else '.fastq')
bam_filter = circlator.bamfilter.BamFilter(bam, reads_prefix, fastq_out=not self.spades_only_assembler)
bam_filter.run()
assembler_dir = outprefix + '.iter.' + str(iteration) + '.assembly'
a = circlator.assemble.Assembler(
reads_prefix + '.fasta',
reads_prefix + ('.fasta' if self.spades_only_assembler else '.fastq'),
assembler_dir,
threads=self.threads,
careful=self.spades_careful,
Expand Down
9 changes: 5 additions & 4 deletions circlator/tasks/all.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,12 +15,12 @@ def print_message(m, opts):
def run():
parser = argparse.ArgumentParser(
description = 'Run mapreads, bam2reads, assemble, merge, clean, fixstart',
usage = 'circlator all [options] <assembly.fasta> <reads.fasta> <output directory>')
usage = 'circlator all [options] <assembly.fasta> <reads.fasta/q> <output directory>')
parser.add_argument('--threads', type=int, help='Number of threads [%(default)s]', default=1, metavar='INT')
parser.add_argument('--verbose', action='store_true', help='Be verbose')
parser.add_argument('--unchanged_code', type=int, help='Code to return when the input assembly is not changed [%(default)s]', default=0, metavar='INT')
parser.add_argument('assembly', help='Name of original assembly', metavar='assembly.fasta')
parser.add_argument('reads', help='Name of corrected reads FASTA file', metavar='reads.fasta')
parser.add_argument('reads', help='Name of corrected reads FASTA or FASTQ file', metavar='reads.fasta/q')
parser.add_argument('outdir', help='Name of output directory (must not already exist)', metavar='output directory')

mapreads_group = parser.add_argument_group('mapreads options')
Expand All @@ -36,7 +36,7 @@ def run():
parser.add_argument('--assemble_spades_k', help='Comma separated list of kmers to use when running SPAdes. Max kmer is 127 and each kmer should be an odd integer [%(default)s]', default='127,117,107,97,87,77', metavar='k1,k2,k3,...')
parser.add_argument('--assemble_spades_use_first', action='store_true', help='Use the first successful SPAdes assembly. Default is to try all kmers and use the assembly with the largest N50')
parser.add_argument('--assemble_not_careful', action='store_true', help='Do not use the --careful option with SPAdes (used by default)')
parser.add_argument('--assemble_not_only_assembler', action='store_true', help='Do not use the --assemble-only option with SPAdes (used by default)')
parser.add_argument('--assemble_not_only_assembler', action='store_true', help='Do not use the --assemble-only option with SPAdes (used by default). Important: with this option, the input reads must be in FASTQ format, otherwise SPAdes will crash because it needs quality scores to correct the reads.')

merge_group = parser.add_argument_group('merge options')
merge_group.add_argument('--merge_diagdiff', type=int, help='Nucmer diagdiff option [%(default)s]', metavar='INT', default=25)
Expand Down Expand Up @@ -103,7 +103,7 @@ def run():
original_assembly_renamed = '00.input_assembly.fasta'
bam = '01.mapreads.bam'
filtered_reads_prefix = '02.bam2reads'
filtered_reads = filtered_reads_prefix + '.fasta'
filtered_reads = filtered_reads_prefix + ('.fastq' if options.assemble_not_only_assembler else '.fasta')
assembly_dir = '03.assemble'
reassembly = os.path.join(assembly_dir, 'contigs.fasta')
merge_prefix = '04.merge'
Expand Down Expand Up @@ -137,6 +137,7 @@ def run():
bam_filter = circlator.bamfilter.BamFilter(
bam,
filtered_reads_prefix,
fastq_out=options.assemble_not_only_assembler,
length_cutoff=options.b2r_length_cutoff,
min_read_length=options.b2r_min_read_length,
contigs_to_use=options.b2r_only_contigs,
Expand Down
2 changes: 2 additions & 0 deletions circlator/tasks/bam2reads.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ def run():
description = 'Make reads from mapping to be reassembled',
usage = 'circlator bam2reads [options] <in.bam> <outprefix>')
parser.add_argument('--discard_unmapped', action='store_true', help='Use this to not keep unmapped reads')
parser.add_argument('--fastq', action='store_true', help='Write fastq output (if quality scores are present in input BAM file)')
parser.add_argument('--only_contigs', help='File of contig names (one per line). Only reads that map to these contigs are kept (and unmapped reads, unless --discard_unmapped is used).', metavar='FILENAME')
parser.add_argument('--length_cutoff', type=int, help='All reads mapped to contigs shorter than this will be kept [%(default)s]', default=100000, metavar='INT')
parser.add_argument('--min_read_length', type=int, help='Minimum length of read to output [%(default)s]', default=250, metavar='INT')
Expand All @@ -19,6 +20,7 @@ def run():
bam_filter = circlator.bamfilter.BamFilter(
options.bam,
options.outprefix,
fastq_out=options.fastq,
length_cutoff=options.length_cutoff,
min_read_length=options.min_read_length,
contigs_to_use=options.only_contigs,
Expand Down
58 changes: 52 additions & 6 deletions circlator/tests/bamfilter_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,11 +119,11 @@ def test_get_region_end(self):
os.unlink(tmp)


def test_run_keep_unmapped(self):
'''test run keep unmapped'''
def test_run_keep_unmapped_no_quals(self):
'''test run keep unmapped bam has no quality scores'''
outprefix = 'tmp.bamfilter_run'
b = bamfilter.BamFilter(
os.path.join(data_dir, 'bamfilter_test_run.bam'),
os.path.join(data_dir, 'bamfilter_test_run_no_qual.bam'),
outprefix,
length_cutoff=600,
min_read_length=100,
Expand All @@ -135,12 +135,58 @@ def test_run_keep_unmapped(self):
os.unlink(outprefix + '.fasta')
os.unlink(outprefix + '.log')

b = bamfilter.BamFilter(
os.path.join(data_dir, 'bamfilter_test_run_no_qual.bam'),
outprefix,
fastq_out=True,
length_cutoff=600,
min_read_length=100,
contigs_to_use={'contig1', 'contig3', 'contig4'}
)
b.run()
expected = os.path.join(data_dir, 'bamfilter_test_run_keep_unmapped.out.reads.fa')
self.assertTrue(filecmp.cmp(expected, outprefix + '.fastq', shallow=False))
os.unlink(outprefix + '.fastq')
os.unlink(outprefix + '.log')


def test_run_keep_unmapped_with_quals(self):
'''test run keep unmapped bam has quality scores'''
outprefix = 'tmp.bamfilter_run'
b = bamfilter.BamFilter(
os.path.join(data_dir, 'bamfilter_test_run_with_qual.bam'),
outprefix,
fastq_out=False,
length_cutoff=600,
min_read_length=100,
contigs_to_use={'contig1', 'contig3', 'contig4'}
)
b.run()
expected = os.path.join(data_dir, 'bamfilter_test_run_keep_unmapped.out.reads.fa')
self.assertTrue(filecmp.cmp(expected, outprefix + '.fasta', shallow=False))
os.unlink(outprefix + '.fasta')
os.unlink(outprefix + '.log')

b = bamfilter.BamFilter(
os.path.join(data_dir, 'bamfilter_test_run_with_qual.bam'),
outprefix,
fastq_out=True,
length_cutoff=600,
min_read_length=100,
contigs_to_use={'contig1', 'contig3', 'contig4'}
)
b.run()
expected = os.path.join(data_dir, 'bamfilter_test_run_keep_unmapped.out.reads.fq')
self.assertTrue(filecmp.cmp(expected, outprefix + '.fastq', shallow=False))
os.unlink(outprefix + '.fastq')
os.unlink(outprefix + '.log')


def test_run_discard_unmapped(self):
'''test run keep unmapped'''
def test_run_discard_unmapped_no_quals(self):
'''test run keep unmapped bam has no quality scores'''
outprefix = 'tmp.bamfilter_run'
b = bamfilter.BamFilter(
os.path.join(data_dir, 'bamfilter_test_run.bam'),
os.path.join(data_dir, 'bamfilter_test_run_no_qual.bam'),
outprefix,
length_cutoff=600,
min_read_length=100,
Expand Down
32 changes: 32 additions & 0 deletions circlator/tests/data/bamfilter_test_run_keep_unmapped.out.reads.fq
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
@contig1:1-100
TTTAATTTGTCCGTAAATTGGGAGGTCTTCAACCGGGGGCGAATGTCGATCTCGTCGAGGCGTTTGTAAAGTGGTAACAGGGGTCATTGATCACGGTGTA
+
IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIHGFDC
@contig1:301-400
CGTTGATGATACGAATTACGTAGGGCTCTGGGAGATGCTCGGAACCCCACAGCGTCTATTTTAGTTGCGACATTACGCGGTATGCGCTTCTGCAAGATGG
+
IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIHGFDC
@contig4:1-100
TCGAAAGTACACTTTGAACTCTAAAAGCGGTTACGACCTCTTCCGTTCGATCGATGCGTGAGTACGTACTCTGGATCCAGCCGTGGCAAACCGGGTAACA
+
IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIHGFDC
@contig4:201-350
CCTCCGCCTGCCTTTGACACACCGGACCTCGGGGGTGTCTAAAAGCCGTCCGTGTGTTGGATAGACATTTGTGACCGTATAGCGGGATGACGTTTCTCTG
+
IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
@contig4:651-801
TGAAGGAGGGGGAGTCGTGCCCGTATCTGGGCCCAGTATATACATTGGGCAGGAGGGTTTGTCAAGAATTCTATCCTTACTAGTCTATTTTCGATACGCG
+
IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIGFDBAA
@contig4:851-950
GAGTACGAGGGACAGATGTCTACACTTGAGCGTACACAAGAATGTGGTACCAAAGGTATCCTCATCGCAACTGGCATTCAAGCCGCTGTTCGACAGTGGG
+
IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIHGFDC
@contig4:901-1000
CAAAGGTATCCTCATCGCAACTGGCATTCAAGCCGCTGTTCGACAGTGGGTCTGTTGTACCCCTCTGCCCAACTGCTGAGTAGTTGGGTAAGGACCGAGT
+
IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIHGFDC
@unmapped_read
TTATGGTACTTCGTTGCTCCCAAGGCTGAACTGATACATAGAGTGGGCTTTGTGATAGAACCAAACGACAACGAAGCGAATTTCGTCACCATCTCCATAA
+
IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIHGFDC
File renamed without changes.
Binary file not shown.
Binary file not shown.
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

setup(
name='circlator',
version='1.4.0',
version='1.4.1',
description='circlator: a tool to circularise genome assemblies',
packages = find_packages(),
package_data={'circlator': ['data/*']},
Expand Down

0 comments on commit 1fbf2a8

Please sign in to comment.