-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcounter_simple.py
47 lines (35 loc) · 1.66 KB
/
counter_simple.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
import math
"""Expectation and variance computed thanks to an approximation based on Ergodic Theorem."""
class Counter_simple:
    """Determine the rarity of a given subsequence in a DNA sequence.

    Model used: Markov chains. Sub-word counts are collected by ``learn``;
    ``expectation``, ``variance`` and ``p_score`` then compare the observed
    count of a word with the count predicted from its prefix and from the
    transition between its last two letters.

    Attributes:
        occurrences: 1-D array of counts aligned with ``vocab`` (``None``
            until ``learn`` has run).
        vocab: array of the distinct sub-words that were counted (``None``
            until ``learn`` has run).
        seq_len: length of the words to be scored.
    """

    def __init__(self, sequence_lenght):
        """Store the word length; counts stay empty until ``learn`` is called.

        The parameter keeps its historical spelling so that existing
        keyword callers continue to work.
        """
        self.occurrences = None
        self.vocab = None
        self.seq_len = sequence_lenght

    def make_chunks(self, s):
        """Return every substring of ``s`` whose length is 1, 2,
        ``seq_len - 1`` or ``seq_len``.

        Non-positive widths (possible when ``seq_len <= 1``) are skipped:
        they would only yield empty or meaningless tokens. Widths are
        visited in ascending order so the result is deterministic.
        """
        widths = sorted(
            w for w in {1, 2, self.seq_len - 1, self.seq_len} if w > 0
        )
        return [s[i:i + w] for w in widths for i in range(len(s) - w + 1)]

    def learn(self, sequence):
        """Count the sub-words of ``sequence`` and store them on the instance.

        ``sequence`` may be a single string or an iterable of strings; only
        the first document is used for the counts.
        """
        # CountVectorizer refuses a bare string, so treat it as one document.
        if isinstance(sequence, str):
            sequence = [sequence]
        vectorizer = CountVectorizer(tokenizer=self.make_chunks, lowercase=False)
        counts = vectorizer.fit_transform(sequence)
        self.occurrences = counts.toarray()[0]
        # get_feature_names() was removed in scikit-learn 1.2; prefer the
        # replacement and fall back for older installations.
        try:
            names = vectorizer.get_feature_names_out()
        except AttributeError:
            names = vectorizer.get_feature_names()
        self.vocab = np.array(names, dtype=str)

    def occ(self, word):
        """Return how many times ``word`` was counted (0 if never seen).

        Only the widths produced by ``make_chunks`` are tracked, so a word
        of any other length always yields 0.
        """
        # Summing the masked selection handles the "not in vocabulary" case
        # and avoids converting a non-scalar array directly to int.
        return int(self.occurrences[self.vocab == word].sum())

    def expectation(self, word):
        """Estimator computed thanks to the Ergodic Theorem.

        Returns ``occ(word[:-1]) * occ(word[-2:]) / occ(word[-2])``.

        Raises:
            IndexError: if ``word`` has fewer than two characters.
            ZeroDivisionError: if the second-to-last letter was never seen.
        """
        prefix_count = self.occ(word[:-1])
        pair_count = self.occ(word[-2:])
        pivot_count = self.occ(word[-2])
        return prefix_count * pair_count / pivot_count

    def variance(self, word):
        """Estimate the variance of the expected occurrences of ``word``.

        Raises:
            IndexError: if ``word`` has fewer than two characters.
            ZeroDivisionError: if the second-to-last letter was never seen.
        """
        # NOTE(review): the scale factor is the observed count of the word,
        # not expectation(word) -- confirm this is the intended estimator.
        frac = self.occ(word)
        pivot_count = self.occ(word[-2])
        term1 = 1 - self.occ(word[:-1]) / pivot_count
        term2 = 1 - self.occ(word[-2:]) / pivot_count
        return frac * term1 * term2

    def p_score(self, word):
        """Return (observed - expected) / sqrt(variance) for ``word``.

        Raises:
            ZeroDivisionError: if the estimated variance is zero.
        """
        deviation = self.occ(word) - self.expectation(word)
        return deviation / math.sqrt(self.variance(word))
# --- Examples (see Count_m)