# counter_m.py
import math

import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import norm
from sklearn.feature_extraction.text import CountVectorizer


class Counter_m:
    """Class used to determine the rarity of a given subsequence in a DNA sequence.
    Model used: Markov chains of order m. Supposedly more precise than Counter_simple,
    but also longer to compute."""

    def __init__(self, sequence_length):
        self.seq_len = sequence_length
        self.m_len = self.seq_len - 2
        self.occurrences = None
        self.vocab = None
        self.alphabet = ['A', 'T', 'C', 'G']
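    # m_len is the order of the underlying Markov chain: words of length seq_len are
    # scored against the counts of their (seq_len - 2)-mers and (seq_len - 1)-mers.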
    def make_chunks_word(self, s):
        """Return every substring of s of length m_len or m_len + 1."""
        return [s[i:i + width] for width in {self.m_len, self.m_len + 1}
                for i in range(len(s) - width + 1)]

    def make_chunks(self, s):
        """Return every substring of s of length m_len, m_len + 1 or seq_len."""
        return [s[i:i + width] for width in {self.m_len, self.m_len + 1, self.seq_len}
                for i in range(len(s) - width + 1)]
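    # For example, with seq_len = 4 (so m_len = 2), make_chunks("ATCGA") returns every
    # 2-mer, 3-mer and 4-mer of the string: AT, TC, CG, GA, ATC, TCG, CGA, ATCG, TCGA
    # (the grouping by width may come out in any order because the widths live in a set).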
    def learn(self, sequence):
        """Fit the model to the learning set. `sequence` must be a list containing a
        single upper-case string, such as ['ATCGATCGATCGATCT']."""
        vectorizer = CountVectorizer(tokenizer=self.make_chunks, lowercase=False)
        self.occurrences = vectorizer.fit_transform(sequence)
        self.occurrences = self.occurrences.toarray()
        self.occurrences = self.occurrences[0]
        # On scikit-learn >= 1.0 this method is called get_feature_names_out()
        self.vocab = vectorizer.get_feature_names()
        self.vocab = np.array(self.vocab)
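    # After learn(), self.vocab and self.occurrences are parallel arrays: vocab[i] is a
    # chunk (an m_len-, (m_len + 1)- or seq_len-mer) and occurrences[i] is its count in
    # the training sequence, so self.occurrences[self.vocab == chunk] looks up one count.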
    def expectation_gaussian(self, word):
        """Compute the estimated expected count of `word` under the order-m Markov model,
        for any word such that len(word) >= m_len + 1."""
        word_len = len(word)
        numerator = 1
        for j in range(0, word_len - self.m_len):
            word1 = word[j:j + self.m_len + 1]
            numerator *= int(self.occurrences[self.vocab == word1])
        denom = 1
        for j in range(1, word_len - self.m_len):
            word1 = word[j:j + self.m_len]
            denom *= int(self.occurrences[self.vocab == word1])
        return numerator / denom  # true (float) division, hence the Python 3 requirement
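    # For reference, with L = len(word), m = m_len and N(.) the learned counts, the loops
    # above compute the usual order-m Markov estimate
    #     E[N(word)] ~= prod_{j=0}^{L-m-1} N(word[j:j+m+1]) / prod_{j=1}^{L-m-1} N(word[j:j+m]),
    # i.e. the product of the (m+1)-mer counts divided by the product of the inner m-mer counts.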
    def __variance_gaussian(self, word):
        """Deprecated: no longer used, see variance_gaussian_max instead."""
        expect = self.expectation_gaussian(word)
        variance = expect
        # Computation of the independent term
        indep_cst = 0
        for d in range(0, self.seq_len - self.m_len - 1):
            if word[0:d + 1] == word[self.seq_len - 1 - d:]:
                # No problem with the length here, it is always correct
                word_repeated = ''.join(word[:d + 1] + word[:self.seq_len])
                indep_cst += self.expectation_gaussian(word_repeated)
        variance += (2 * indep_cst)
        # Computation of the quadratic term
        word_vector = CountVectorizer(tokenizer=self.make_chunks_word, lowercase=False)
        word_occ = word_vector.fit_transform([word])
        word_occ = word_occ.toarray()
        word_occ = word_occ[0]
        word_vocab = word_vector.get_feature_names()
        word_vocab = np.array(word_vocab)
        quad_cst = 0
        for subword in word_vocab:
            if len(subword) == self.m_len:
                apparitions = [int(word_occ[word_vocab == subword + k])
                               for k in self.alphabet if (subword + k in word_vocab)]
                num = sum(apparitions)
                quad_cst += (num ** 2) / int(self.occurrences[self.vocab == subword])
            elif len(subword) == self.m_len + 1:
                quad_cst -= (int(word_occ[word_vocab == subword]) ** 2) / \
                    int(self.occurrences[self.vocab == subword])
        apparitions = [int(word_occ[word_vocab == word[0:self.m_len] + k])
                       for k in self.alphabet if (word[0:self.m_len] + k in word_vocab)]
        num = sum(apparitions)
        quad_cst += (1 - 2 * num) / int(self.occurrences[self.vocab == word])
        variance += (expect * expect) * quad_cst
        return variance
    def variance_gaussian_max(self, word):
        """Compute the variance of the count of `word` according to our model."""
        expect = self.expectation_gaussian(word)
        occ_word1 = int(self.occurrences[self.vocab == word[1:self.seq_len - 1]])
        occ_word_end = int(self.occurrences[self.vocab == word[1:self.seq_len]])
        occ_word_beg = int(self.occurrences[self.vocab == word[0:self.seq_len - 1]])
        result = expect / (occ_word1 ** 2) * (occ_word1 - occ_word_end) * (occ_word1 - occ_word_beg)
        return result
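    # In the same notation as above, the formula evaluated here is
    #     Var[N(word)] ~= E[N(word)] * (N(inner) - N(suffix)) * (N(inner) - N(prefix)) / N(inner)^2
    # with inner = word[1:-1], suffix = word[1:] and prefix = word[:-1].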
    def p_score(self, word):
        """Deprecated: relies on __variance_gaussian and does not work; use p_score_max."""
        num = int(self.occurrences[self.vocab == word]) - self.expectation_gaussian(word)
        std = math.sqrt(self.__variance_gaussian(word))
        print("difference :", num)
        print("std : ", std)
        print(word)
        return num / std
    def p_score_max(self, word):
        """Return the p_score of the considered word. As a reminder: the smaller (larger)
        the p_score, the more likely the word is unexpectedly rare (over-represented).
        See the readme for more details."""
        num = int(self.occurrences[self.vocab == word]) - self.expectation_gaussian(word)
        std = math.sqrt(self.variance_gaussian_max(word))
        print("difference :", num)
        print("std : ", std)
        print(word)
        if std == 0:
            return None
        return num / std
    def p_scores(self):
        """Return the (p_score, word) pairs for every word of length seq_len observed in
        the sequence, sorted by increasing p_score (most unexpectedly rare first)."""
        obs_words = [word for word in self.vocab if len(word) == self.seq_len]
        scores = [(self.p_score_max(word), word) for word in obs_words]
        p_scores = [score for score in scores if score[0] is not None]
        p_scores.sort()
        return p_scores
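# Minimal usage sketch (illustrative only; the toy sequence, its length and the seed are
# arbitrary choices, not taken from the original experiment):
#     rng = np.random.default_rng(0)
#     toy_seq = ''.join(rng.choice(list('ATCG'), size=2000))
#     toy_counter = Counter_m(6)
#     toy_counter.learn([toy_seq])
#     print(toy_counter.p_scores()[:5])   # the five most unexpectedly rare 6-mers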
# Example
if __name__ == "__main__":
    sequence_length = 6
    # Raw string so the Windows backslashes are not treated as escape sequences
    with open(r"D:\Pierre\Git\MOPSI\MOPSI\E_coli.txt", 'r') as f:
        seq_complete = f.read().splitlines()
    seq_complete = ''.join(seq_complete)
    seq_complete = [seq_complete]
    count = Counter_m(sequence_length)
    count.learn(seq_complete)
    p_sc = count.p_scores()
    pv = [ps[0] for ps in p_sc]
    # Histogram of p_scores for the E_coli DNA sequence; density=True normalises the
    # histogram (the old 'normed' keyword was removed from matplotlib)
    plt.hist(pv, bins='auto', density=True, color='blue')
    xmin, xmax = plt.xlim()
    x = np.linspace(xmin, xmax, 100)
    # Overlay the standard normal density for comparison (presumed use of the otherwise
    # unused norm import and x grid)
    plt.plot(x, norm.pdf(x), color='red')
    plt.show()