forked from TorHou/merge_pcr_duplicates
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmerge_pcr_duplicates.py
119 lines (103 loc) · 4.28 KB
/
merge_pcr_duplicates.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
import pysam
from collections import Counter
import argparse
from Bio import SeqIO
bam_data = []
score = Counter()
endResult = []
fastq_data = {}
bam_filter = []
merge_data = []
def get_bam_filter(line, chrom_bam):
if line.is_reverse:
strand = '-'
else:
strand = '+'
if 'N' not in fastq_data[line.query_name]:
bam_filter.append((line.reference_start,
line.reference_start + line.reference_length,
line.query_name, strand,
chrom_bam, fastq_data[line.query_name]))
return bam_filter
def bam_reader(bam_file, xs_tag):
global merge_data
bam = pysam.AlignmentFile(bam_file, "rb")
filtered_data = []
chromosomes = bam.references
for chrom in chromosomes:
data = bam.fetch(chrom, multiple_iterators=True)
for entry in data:
try:
chrom_bam = bam.get_reference_name(entry.reference_id)
except:
continue
if (entry.is_unmapped == False and chrom_bam == chrom):
if xs_tag and entry.has_tag("XS"):
continue
else:
filtered_data = get_bam_filter(entry, chrom_bam)
chromosome_info(filtered_data)
print_results(merge_data)
merge_data = []
bam.close()
def fastq_reader(fastq_file):
fastq_dt = {}
for fastq_data in SeqIO.parse(fastq_file, "fastq"):
fastq_dt[str(fastq_data.id)] = str(fastq_data.seq)
return fastq_dt
def chromosome_counter(i, bam_data):
if args.ends:
merge_checks = (bam_data[i][4], bam_data[i][0], bam_data[i][1], bam_data[i][5], bam_data[i][3])
else:
merge_checks = (bam_data[i][4], bam_data[i][0], bam_data[i][5], bam_data[i][3])
score[merge_checks] += 1
if score[merge_checks] == 1:
merge_data.append((bam_data[i][4], bam_data[i][0], bam_data[i][1], bam_data[i][2], bam_data[i][3]))
#append_merge = merge_data.append
#if score[merge_checks] == 1:
#append_merge((bam_data[i][4], bam_data[i][0], bam_data[i][1], bam_data[i][2], bam_data[i][3]))
def chromosome_info(bam_data):
chr_info = []
for i in range(len(bam_data)):
rec_id = bam_data[i][2]
if rec_id in fastq_data:
chromosome_counter(i, bam_data)
else:
print(rec_id + " ID not found in fastq file")
def print_results(endResult):
if endResult != [] :
with open(args.output_file, "w") as f:
for entry in endResult:
if args.ends:
sc = score[(entry[0], entry[1], entry[2], fastq_data[entry[3]], entry[4])]
else:
sc = score[(entry[0], entry[1], fastq_data[entry[3]], entry[4])]
f.write('\t'.join( str(x) for x in [entry[0], entry[1], entry[2], entry[3], sc, entry[4]] ) + '\n')
tool_description = """
Merge PCR duplicates according to a random barcode library.
Input:
* bam file containing fastq read-id and other details
* fastq library of random barcodes
Output:
* bed6 file with random barcode in name field and number of PCR duplicates as
score, sorted by fields chrom, start, stop, strand, name
Example usage:
python3 pysam_test.py example1.bam example2.fa
"""
epilog = """
Author: Fayyaz Hussain
Status: Testing
"""
# parse command line arguments
parser = argparse.ArgumentParser(description=tool_description,
epilog=epilog, formatter_class=argparse.RawDescriptionHelpFormatter)
parser.add_argument("bam_file", help="Path to bam file containing alignments.", metavar='BAM_File')
parser.add_argument("fastq_file", help="Path to fastq barcode library.", metavar='FASTQ_File')
parser.add_argument("-o", "--output_file", required=True, help="Write results to this file.",
metavar='Output_File')
parser.add_argument("--filter_by_nh_tag", action='store_true', default=False, help="Mapped reads with XS tag will be excluded.")
parser.add_argument("-e", "--ends", action='store_true', default=False, help="Consider sequence end coordinates when merging the PCR duplicates. By default reads with the same chromosome, strand, start and barcode are merged")
args = parser.parse_args()
args.filter_by_nh_tag
fastq_data = fastq_reader(args.fastq_file)
bam_reader(args.bam_file, args.filter_by_nh_tag)