setencesCorpus.py
# -*- coding: utf-8 -*-
import os
import logging
import codecs
import re
import nltk
from nltk.corpus import stopwords
from nltk.corpus import PlaintextCorpusReader
from settings import Settings
from datetime import datetime
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import lda
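
# Reads law texts (Portuguese) from the "leis" directory, builds a
# term-document matrix with CountVectorizer, and fits an LDA topic model
# over the word counts.
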
class ReadSentences(object):
    def __init__(self):
        self.s = Settings()
        self.createLog()

    def createLog(self):
        # Name the log file with a day-month-year_hour_minute timestamp.
        log_file = "log" + datetime.now().strftime("%d%m%Y_%H_%M")
        self.log_file = os.path.join(self.s.path, "log", log_file)
        logging.basicConfig(filename=self.log_file, format='%(levelname)s:%(message)s', level=logging.INFO)

    def getJudge(self, text):
        # Extract the judge's name from the digital-signature notice
        # embedded in the document text.
        judge = ""
        pattern = re.compile(r"Este documento é cópia do original, assinado digitalmente por ([A-Z ]*), liberado nos autos em", re.MULTILINE)
        for match in pattern.finditer(text):
            judge = match.groups()[0]
            print(judge)
        return judge
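
    # Example (hypothetical notice): for the text "Este documento é cópia do
    # original, assinado digitalmente por MARIA SILVA, liberado nos autos em
    # 01/01/2020", getJudge returns "MARIA SILVA".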

    def readText(self, file):
        # Read one file, lowercase it, strip digits and underscores, then
        # collapse runs of whitespace into single spaces.
        text = ""
        with codecs.open(os.path.join(self.s.path, "leis", file), "r", "UTF-8") as handle:
            for line in handle.readlines():
                line = line.rstrip().lower()
                line = re.sub(r"[0-9]+", "", line)
                line = re.sub(r"[_]+", "", line)
                text += " " + line
        text = re.sub(r"\s+", " ", text)
        return text
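
    # Example (hypothetical line): "Artigo 12 _x_ da Lei" is returned as
    # " artigo x da lei" (lowercased, digits and underscores removed,
    # whitespace collapsed).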

    def tdm(self):
        # Build a term-document matrix: rows are vocabulary terms, columns
        # are the files found in the "leis" directory.
        vectorizer = CountVectorizer(strip_accents="unicode", max_df=0.8)
        list_files = os.listdir(os.path.join(self.s.path, "leis"))
        list_docs = [self.readText(x) for x in list_files]
        x1 = vectorizer.fit_transform(list_docs)
        # get_feature_names_out() supersedes get_feature_names() in scikit-learn >= 1.0
        df = pd.DataFrame(x1.toarray().transpose(), index=vectorizer.get_feature_names_out())
        df.columns = list_files
        return df
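
    # The returned frame is terms x documents, e.g. with hypothetical files
    # lei1.txt and lei2.txt in "leis":
    #
    #               lei1.txt  lei2.txt
    #   direito            3         1
    #   processo           0         2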

    def stop_words(self):
        # Punctuation marks and domain-specific terms to filter out, plus
        # the NLTK Portuguese stop word list (extend keeps the list flat).
        punctuation = [".", "!", "-", "?", ",", "lei", "artigo", ")", "(", "subchefia", ":", "presidente", "$", "°"]
        punctuation.extend(stopwords.words("portuguese"))
        return punctuation

    def tokenizer(self, file):
        # Tokenize one file and print its token frequency distribution.
        tokens = nltk.word_tokenize(self.readText(file))
        print(len(tokens))
        fd1 = nltk.FreqDist(tokens)
        for f in fd1:
            print(f, fd1[f])

    def read(self):
        # Tokenize each file with the Portuguese Punkt sentence tokenizer and
        # print the ten most frequent words after stop word filtering.
        portuguese_sent_tokenizer = nltk.data.load("tokenizers/punkt/portuguese.pickle")
        newcorpus = PlaintextCorpusReader(os.path.join(self.s.path, "leis"), ".*", sent_tokenizer=portuguese_sent_tokenizer)
        filter_words = set(self.stop_words())
        for fileid in newcorpus.fileids():
            words = [w.lower() for w in newcorpus.words(fileid)]
            filtered_words = [word for word in words if word not in filter_words]
            fd1 = nltk.FreqDist(filtered_words)
            print(fileid, fd1.most_common(10))
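
    # Example output line (hypothetical counts):
    #   lei1.txt [('direito', 42), ('federal', 17), ...]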


if __name__ == "__main__":
    r = ReadSentences()
    p = r.tdm()
    print(p.shape)
    vocab = p.index.values
    print(vocab[2:20])
    # lda expects a documents x terms count matrix, so fit on the transpose
    # of the term-document frame.
    model = lda.LDA(n_topics=20, n_iter=1500, random_state=1)
    model.fit(p.to_numpy().T)
    topic_word = model.topic_word_
    n_top_words = 8
    for i, topic_dist in enumerate(topic_word):
        topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words + 1):-1]
        print('Topic {}: {}'.format(i, ' '.join(topic_words)))
    # p.to_csv(os.path.join(r.s.path, "tabela2.csv"))
    # print(stopwords.words("portuguese"))
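
    # Minimal follow-up sketch: report each document's dominant topic via the
    # fitted model's doc_topic_ attribute (shape: n_documents x n_topics);
    # file names come from the frame's columns.
    doc_topic = model.doc_topic_
    for name, dist in zip(p.columns, doc_topic):
        print("{}: topic {}".format(name, dist.argmax()))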