forked from TorHou/merge_pcr_duplicates
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcrosslink_quality_check.py
182 lines (142 loc) · 5.39 KB
/
crosslink_quality_check.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
from matplotlib.backends.backend_pdf import PdfPages
from scipy import stats
import os
import matplotlib.pyplot as plt
import numpy
import pandas
import logging
import argparse
####################
##  ARGS INPUT  ##
####################
tool_description = """
This tool checks the reproducibility of the crosslinking sites between samples in fasta format.
Ideally the trend should follow a diagonal line with the highest reproducible motif in the right
upper most corner.
By default output is written to source file location.
Example usage:
crosslink_quality_check.py file1.fasta file2.fasta kmer_length -o output.pdf
"""

# Build the command-line interface.
cli_parser = argparse.ArgumentParser(
    description=tool_description,
    formatter_class=argparse.RawDescriptionHelpFormatter)

# Positional arguments: the files to compare and the kmer size.
cli_parser.add_argument(
    "file1",
    help="Path to first fasta file.")
cli_parser.add_argument(
    'file2',
    nargs='+',
    help="Path to the other fasta files. Specify at least one more file for file1 to be compared to.")
cli_parser.add_argument(
    'kmer_length',
    type=int,
    help="Length of the kmers. Keep in mind that the sequences should be long enough.")

# Optional arguments.
cli_parser.add_argument(
    "-o", "--outfile",
    help="Write results to this file.")
cli_parser.add_argument(
    "-d", "--debug",
    help="Print lots of debugging information",
    action="store_true")

args = cli_parser.parse_args()

# Verbose logging (with timestamps) only when --debug is given.
if args.debug:
    logging.basicConfig(
        level=logging.DEBUG,
        format="%(asctime)s - %(filename)s - %(levelname)s - %(message)s")
else:
    logging.basicConfig(
        format="%(filename)s - %(levelname)s - %(message)s")

# Echo the parsed arguments for traceability.
logging.info("Parsed arguments:")
logging.info("  file1: '{}'".format(args.file1))
logging.info("  file2: '{}'".format(args.file2))
logging.info("  kmer_length: '{}'".format(args.kmer_length))
if args.outfile:
    logging.info("  outfile: enabled writing to file")
    logging.info("  outfile: '{}'".format(args.outfile))
logging.info("")

# A kmer of length <= 1 is meaningless for motif comparison.
if args.kmer_length <= 1:
    raise Exception("[ERROR] kmer length too short. Your kmer length makes no sense.")
###################
##   READ DATA   ##
###################
print("[START]")
print("[NOTE] Read data")

# Collect every input file: file1 plus all additional files from file2.
files = [args.file1]
files.extend(args.file2)

# Peek at the first sequence of the first file to learn the sequence length.
# NOTE(review): input looks like one plain sequence per line (no ">" headers
# are handled anywhere) -- confirm the upstream format.
# Use a context manager so the handle is closed even if readline() raises.
with open(files[0]) as first_handle:
    # BUGFIX: strip the trailing newline before computing the midpoint;
    # otherwise the midpoint is off by one for odd-length sequences.
    firstline = first_handle.readline().rstrip('\n')

# We assume the crosslink nucleotide sits in the middle of the sequence.
cl_nucleotide = len(firstline) // 2

# First kmer window starts kmer_length-1 positions left of the crosslink
# site, so every generated kmer still overlaps the crosslink nucleotide.
start_iter = cl_nucleotide - args.kmer_length + 1
print("[NOTE] finish")
######################
##   PROCESS DATA   ##
######################
print("[NOTE] Process data")

# Map each kmer -> numpy vector of counts, one slot per input file.
kmer_dict = dict()
for f, file in enumerate(files):
    with open(file) as openfileobject:
        for seq in openfileobject:
            # BUGFIX: drop the trailing newline so kmers near the end of the
            # window never contain '\n' and the length check uses the true
            # sequence length (the original counted the newline character).
            seq = seq.rstrip('\n')
            # The full kmer window must fit inside the sequence.
            if len(seq) < (args.kmer_length*2 - 1):
                raise Exception("[ERROR] kmer length too long, in other words, sequence is too short.")
            # Slide a window of size kmer_length over every start position
            # that still covers the crosslink nucleotide.
            for i in range(start_iter, cl_nucleotide+1):
                kmer = seq[i:i+args.kmer_length]
                if kmer in kmer_dict:
                    # Known kmer: increment the count for the current file.
                    kmer_dict[kmer][f] += 1
                else:
                    # First sighting: start a fresh per-file count vector.
                    init = numpy.zeros(len(files))
                    init[f] = 1
                    kmer_dict[kmer] = init
        # NOTE: no explicit close() needed -- the with-statement handles it.

# Total number of counted kmers for each file (sample).
sum_vector = numpy.zeros(len(files))
for values_vector in kmer_dict.values():
    sum_vector += values_vector

# Convert absolute counts into relative abundances per file.
for kmer in kmer_dict:
    kmer_dict[kmer] = kmer_dict[kmer] / sum_vector

print("[NOTE] finish")
##############
##   PLOT   ##
##############
print("[NOTE] Make plots")

# Default output location is the directory of this script.
plotpath = os.path.dirname(os.path.abspath(__file__)) + '/'

# For each pair of files: a scatter plot of relative kmer abundances plus a
# CSV of the motifs sorted by abundance (most reproducible first).
for i in range(0, len(files)):
    for j in range(i+1, len(files)):
        if args.outfile:
            outfile_name = args.outfile
            # BUGFIX: with more than two input files a single --outfile was
            # silently overwritten by every pair; disambiguate with indices.
            if len(files) > 2:
                root, ext = os.path.splitext(args.outfile)
                outfile_name = root + '_' + str(i) + '_' + str(j) + (ext or '.pdf')
        else:
            outfile_name = plotpath + 'Crosslink_Kmer_Quality_Check_' + str(i) + '_' + str(j) + '.pdf'

        pp = PdfPages(outfile_name)
        df = pandas.DataFrame(kmer_dict).T

        # Linear regression between the relative abundances of the two files.
        slope, intercept, r_value, p_value, std_err = stats.linregress(df[i], df[j])

        # BUGFIX: start a fresh figure per pair; the original reused the
        # implicit figure, so points of earlier pairs accumulated in every
        # later plot.
        fig = plt.figure()
        plt.plot(df[i], df[j], ls='', marker='.', ms=10.0)
        # BUGFIX: df[i] is on the x axis and df[j] on the y axis, so label
        # accordingly (the labels were swapped in the original).
        plt.xlabel(files[i])
        plt.ylabel(files[j])
        max_x = max(df[i])
        max_y = max(df[j])
        # BUGFIX: the annotation claims R^2, so square the correlation
        # coefficient instead of printing r itself.
        plt.text(max_x/3, max_y/2 + max_y*.2, "R" + r'$^2 =$' + " " + str(r_value**2), fontsize=15)
        pp.savefig(fig)
        pp.close()
        plt.close(fig)

        # BUGFIX: to_csv crashes if the target directory does not exist.
        csv_dir = 'test-data'
        os.makedirs(csv_dir, exist_ok=True)
        outfile_name = csv_dir + '/reproducible_motifs_' + str(i) + '_' + str(j) + '.csv'
        df_sorted = df[[i, j]]
        df_sorted = df_sorted.sort_values([i, j], ascending=[False, False])
        df_sorted.columns = [files[i], files[j]]
        df_sorted.to_csv(outfile_name, sep='\t')

print("[FINISH]")