-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcheck.py
77 lines (67 loc) · 2.13 KB
/
check.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
# -*- coding: UTF-8 -*-
# PDF spell checker
# Kero2375
import sys
try:
import textract
import language_check
import re
except ImportError:
sys.exit("Please install textract and language-check before (pip3 install *). Or mabe you're not using python3.")
# The PDF to check is given as the first command-line argument;
# abort with a short usage hint when it is missing.
if len(sys.argv) < 2:
    sys.exit("Please add filename")
nf = sys.argv[1]
# LanguageTool rule ids that are reported; matches of any other rule are
# dropped by the filtering loop further down.
rules = [
    "MORFOLOGIK_RULE_IT_IT",         # typing / spelling errors
    "COMMA_PARENTHESIS_WHITESPACE",  # spacing around punctuation
    "WHITESPACE_PUNCTUATION",        # spacing around punctuation
    "GR_04_002",                     # abbreviated forms
    "ST_03_001",                     # "ad" / "ed" ...
    "ST_02_001",                     # sentence too long
    "GR_05_002",                     # sentence starts with a conjunction
    "ER_01_001",                     # "ha" / "a" ...
    "ST_01_005",                     # suggestions for translating from English
    "UPPERCASE_SENTENCE_START",      # sentence starts in lowercase
    "GR_10_001",                     # verb tenses
]
# Words (mostly proper names) that must not be reported as spelling
# mistakes: a MORFOLOGIK_RULE_IT_IT match whose text contains any of
# these is skipped.
ignore = [
    "Discalzi",
    "Cannavò",
    "Bari",
    "Biolcati",
    "Rinaldi",
    "Piran",
    "Vardanega",
    "Cardin",
    "Zucchetti",
    "Grafana",
]
def _is_suppressed(rule_id, report):
    """Return True when a match of an enabled rule should still be skipped.

    rule_id -- the LanguageTool rule id of the match.
    report  -- the string form of the match, as it would be written to the log.
    """
    if rule_id == 'COMMA_PARENTHESIS_WHITESPACE':
        # Avoid reporting '. .' sequences (e.g. dotted leaders in the index).
        return '. .' in report
    if rule_id == 'MORFOLOGIK_RULE_IT_IT':
        # A spelling match that mentions any ignored word is not an error.
        return any(word in report for word in ignore)
    return False


print('selected file: ' + nf)
# Extract the text from the PDF; the result is bytes and is decoded as UTF-8.
text = textract.process(nf, method='pdftotext')
text = text.decode('utf-8')
print('loading...')
tool = language_check.LanguageTool('it-IT')  # Italian checker
matches = tool.check(text)  # every match found, filtered below

# The context manager closes the log even if writing fails part-way, and the
# explicit UTF-8 encoding keeps accented characters writable regardless of
# the platform's default locale encoding.
with open('log.txt', 'w', encoding='utf-8') as log_file:
    for m in matches:
        # Only rules listed in `rules` are ever reported.
        if m.ruleId not in rules:
            continue
        report = str(m)
        if _is_suppressed(m.ruleId, report):
            continue
        log_file.write(report + '\n\n')
print('all done! check the file log.txt')