-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmulti_sample_overlap.py
233 lines (193 loc) · 9.22 KB
/
multi_sample_overlap.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
# Function for the assessment of overlap between multiple files of ChIP-seq data (deployed in Jupyter Lab)
# Example usage:
# spearman_peaks(prom_liver_brain_list, 10000, 'prom_liver_brain_cor_10000.txt')
# prom_liver_brain_list is a list of files of ChIP-seq peaks sorted by log10P
# 10000 is the number of peaks to use across the peak files
# prom_liver_brain_cor_10000.txt is the output file name
import os.path
from os import path
from scipy import stats
from scipy.stats import uniform, randint
from scipy.stats import spearmanr # Soearman test
# Liver overlap, function to assess correlation between files, Fig 2B/C
def spearman_peaks(file_list, max_peak_count, out_file):
out = open(out_file, 'w')
out.write('File_A_Path\tFile_B_Path\tFile_A\tFile_B\tIntersect_Count\tFile_A_Uni\tFile_B_Uni\tRho\tP\tRho_int\tP_int\n')
unique_matrix = {}
for file_a in file_list:
# Restrict to max_peak_count lines
max_file_a = add_name(file_a, pre='', suf=f'.{max_peak_count}', ext='')
if not path.exists(max_file_a):
!head -$max_peak_count $file_a > $max_file_a
# Sort by chr, pos
s_max_file_a = add_name(max_file_a, pre='', suf=f'.gsort', ext='')
if not path.exists(s_max_file_a):
!cat $max_file_a | sort -k1,1 -k2,2n > $s_max_file_a
for file_b in file_list:
# Add files to unique matrix
if file_a not in unique_matrix:
unique_matrix[file_a] = {file_b: 0}
elif file_b not in unique_matrix[file_a]:
unique_matrix[file_a][file_b] = 0
if file_b not in unique_matrix:
unique_matrix[file_b] = {file_a: 0}
elif file_a not in unique_matrix[file_b]:
unique_matrix[file_b][file_a] = 0
if unique_matrix[file_b][file_a] == 0:
# Restrict to max_peak_count lines
max_file_b = add_name(file_b, pre='', suf=f'.{max_peak_count}', ext='')
if not path.exists(max_file_b):
!head -$max_peak_count $file_b > $max_file_b
# Sort by chr, pos
s_max_file_b = add_name(max_file_b, pre='', suf=f'.gsort', ext='')
if not path.exists(s_max_file_b):
!cat $max_file_b | sort -k1,1 -k2,2n > $s_max_file_b
# Only assess different files
if file_a == file_b:
pass
else:
file_a_simp = file_a.split('/')[-1]
file_b_simp = file_b.split('/')[-1]
print(f'A: {file_a_simp}')
print(f'B: {file_b_simp}')
# Obtain peak ranks
a_peak_to_rank = {}
fa = open(max_file_a, 'r')
for a_rank, line in enumerate(fa):
a_peak_name = line.split('\t')[3]
a_peak_to_rank[a_peak_name] = a_rank
fa.close()
b_peak_to_rank = {}
fb = open(max_file_b, 'r')
for b_rank, line in enumerate(fb):
b_peak_name = line.split('\t')[3]
b_peak_to_rank[b_peak_name] = b_rank
fb.close()
# Perform AB intersections
file_ab_int = 'temp_AB_int.txt'
!bedtools intersect -b $s_max_file_a -a $s_max_file_b > $file_ab_int
file_ba_int = 'temp_BA_int.txt'
!bedtools intersect -b $s_max_file_b -a $s_max_file_a > $file_ba_int
# Open temp file
cor_file = 'temp_cor.txt'
corf = open(cor_file, 'w')
# Record intersection coords
# Coords are unique and same between AB and BA
# Each coord has one peak name
# A peak name can appear twice
coord_to_b_name = {}
ab_int = open(file_ab_int, 'r')
for line_count, line in enumerate(ab_int):
tab = line.split('\t')
peak_coord = f'{tab[0]}_{tab[1]}_{tab[2]}'
b_peak = tab[3]
coord_to_b_name[peak_coord] = b_peak
ab_int.close()
a_peak_set = set()
b_peak_set = set()
a_rank_list = []
b_rank_list = []
int_count = file_a_uni = file_b_uni = 0
ba_int = open(file_ba_int, 'r')
for line_count, line in enumerate(ba_int):
tab = line.split('\t')
peak_coord = f'{tab[0]}_{tab[1]}_{tab[2]}'
a_peak = tab[3]
b_peak = coord_to_b_name[peak_coord]
# Get the ranks
a_peak_set.add(a_peak)
b_peak_set.add(b_peak)
a_rank = a_peak_to_rank[a_peak]
b_rank = b_peak_to_rank[b_peak]
a_rank_list.append(a_rank)
b_rank_list.append(b_rank)
int_count += 1
corf.write(f'{a_peak}\t{a_rank}\t{b_rank}\t{b_peak}\n')
ba_int.close()
rho_int, p_int = spearmanr(a_rank_list, b_rank_list)
# Add the peaks outside the intersection
for a_peak in a_peak_to_rank:
if a_peak not in a_peak_set:
file_a_uni += 1
a_rank = a_peak_to_rank[a_peak]
a_rank_list.append(a_rank)
b_rank_list.append(max_peak_count)
corf.write(f'{a_peak}\t{a_rank}\t{max_peak_count}\t.\n')
for b_peak in b_peak_to_rank:
if b_peak not in b_peak_set:
file_b_uni += 1
b_rank = b_peak_to_rank[b_peak]
a_rank_list.append(max_peak_count)
b_rank_list.append(b_rank)
corf.write(f'.\t{max_peak_count}\t{b_rank}\t{b_peak}\n')
# corf.close()
rho, p = spearmanr(a_rank_list, b_rank_list)
print(rho, p)
out.write(f'{file_a}\t{file_b}\t{file_a_simp}\t{file_b_simp}\t{int_count}\t{file_a_uni}\t{file_b_uni}\t{rho}\t{p}\t{rho_int}\t{p_int}\n')
unique_matrix[file_a][file_b] = 1
unique_matrix[file_b][file_a] = 1
out.close()
def add_name(path_and_file_name, pre, suf, ext):
"""
Rename and input file, respecting the path and extension
Parameters
----------
path_and_file_name : str
path and file name, e.g., /path/to/file_name.txt
pre : str
prefix to be added before the file name
suf : str
suffix to be added after the file name
ext : str
extension to be added at the end of the file name
Returns
-------
the_output : str
modified path and file name, e.g., /path/to/prefix_file_name_suffix.extension
"""
bits = path_and_file_name.split('/')
path = '/'.join(bits[:-1])
if len(path) > 0:
path = f'{path}/'
file_name_no_path = bits[-1]
file_bits = file_name_no_path.split('.')
recognized_ext = ['csv', 'txt', 'pdf', 'png', 'bed', 'fa']
# Add a period to the extension as needed
extension_mod = ''
if ext is not None and ext != '':
if ext.startswith('.'):
extension_mod = ext
else:
extension_mod = f'.{ext}'
the_output = ''
if len(file_bits) == 1:
# No periods in the name, therefore no extension
if extension_mod is not None and extension_mod != '':
the_output = f'{path}{pre}{file_name_no_path}{suf}{extension_mod}'
else:
the_output = f'{path}{pre}{file_name_no_path}{suf}'
else:
# Periods present in file name
orig_ext = file_bits[-1]
if orig_ext not in recognized_ext:
# String after the period does not appear to be an extension
if extension_mod is not None and extension_mod != '':
the_output = f'{path}{pre}{file_name_no_path}{suf}{extension_mod}'
else:
the_output = f'{path}{pre}{file_name_no_path}{suf}'
else:
file_name_simp = '.'.join(file_bits[:-1])
if extension_mod is not None and extension_mod != '':
# New extension replaces the old one
the_output = f'{path}{pre}{file_name_simp}{suf}{extension_mod}'
else:
# Keep the old extension
the_output = f'{path}{pre}{file_name_simp}{suf}.{orig_ext}'
if the_output != path_and_file_name:
print(f'add_name output: {the_output}')
return the_output
else:
print(f'ERR: input name and output name are the same.\n'
f'Input: {path_and_file_name}\n'
f'Output: {the_output}')
return None