-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathlanguage_detection.py
89 lines (83 loc) · 4.04 KB
/
language_detection.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
import os
from collections import Counter
import re
if __name__=="__main__":
print("\nWhich example you want to test (0,1,2,3 or 4)?")
example=input()
current_directory = os.getcwd()
corpus_dir = os.path.join(current_directory,"publicDataSet","public","set",example,"corpus")
sequences_dir = os.path.join(current_directory,"publicDataSet","public","set",example,"sequences.txt")
data={}
corpus_bigrams={}
corpus_occurances={}
counter=Counter() #used to count bigram occurances
i=0 #used to index a language in languages
print("\n--------EXTRACTING AND COUNTING BIGRAMS--------\n")
for root, subfolders, files in os.walk(corpus_dir):
if root==corpus_dir:
languages=subfolders
if os.path.basename(root) in languages:
counter.clear() #clear the counter if you start processing a new language
i+=1
for file in files: # processing txt files for the current language
f=open(os.path.join(root,file),'r',encoding='utf-8')
tekst=f.read()
tekst=tekst.lower()
pattern=re.compile(r'(?=(..))',flags=re.IGNORECASE) #define regex pattern
bigrams = re.findall(pattern,tekst) #find all bigrams with given pattern
counter.update(bigrams) #update the bigram count, with the count from the current txt file
f.close()
if not subfolders:
data[languages[i-1]]=list(counter.items())
corpus_bigrams[languages[i-1]]=list(counter.keys())
corpus_occurances[languages[i-1]]=list(counter.values())
# sorting the data
languages.sort()
sliced_data={}
for lg in languages:
data[lg].sort(key=lambda tup: tup[0])
data[lg].sort(key=lambda tup: tup[1], reverse=True)
sliced_data[lg]=data[lg][0:5]
# printing the language,bigram and count
for tuples in sliced_data[lg]:
print(lg+','+tuples[0]+f',{tuples[1]}')
f.close()
print("\n--------Bigrams successfully extracted--------\n")
#SECOND PART, Calculating the probabilities for given sequences
print("\n--------CALCULATING PROBABILITIES--------\n")
f2=open(sequences_dir,'r',encoding='utf-8')
sequences=f2.read().splitlines()
#evaluating P(Li|text)=P(text|Li)/sum_for_every_i(P(text|Li))
#this formula is correct only if apriori probabilities P(Li) are all the same (for every i)
P_text_c_Li={}
data2={}
counter.clear()
for lg in languages:
P_text_c_Li[lg]=[]
all_occur=sum(o for b,o in data[lg]) # sum of all occurances for given language
for lines in sequences:
lines.strip()
lines=lines.lower()
pattern2=re.compile(r"(?=(..))",flags=re.IGNORECASE) # define pattern for bigrams
bigrams2 = re.findall(pattern2,lines) # find all bigrams for given pattern
counter.update(bigrams2)
data2[lg]=list(counter.items())
probability=1
for (bigram,occur) in data2[lg]:
if bigram in corpus_bigrams[lg]: # if bigram exist in the corpus
bigram_occur_in_corpus=corpus_occurances[lg][corpus_bigrams[lg].index(bigram)]
P_xy_c_Li=bigram_occur_in_corpus/all_occur # P(xy|Li)
probability*=P_xy_c_Li*occur
else:
probability=0
counter.clear()
P_text_c_Li[lg].append(probability) # P(text|Li)
for i in range(0,len(sequences)):
print(f"Probabilities for the {i+1}. sequence:")
for j in range(0,len(languages)):
print(languages[j]+',',end='')
total_probability=sum(x[i] for x in P_text_c_Li.values())# sum_for_every_i(P(text|Li))
if P_text_c_Li[languages[j]][i]==0:
print(f'{0}')
else:
print(f'{P_text_c_Li[languages[j]][i]/total_probability}')