-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathlint_po_args.py
executable file
·195 lines (172 loc) · 7.36 KB
/
lint_po_args.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
#!/usr/bin/env python3
"""
Checks for grave translation errors and typos, like translating "please use
the -0 option" as "bitte nutze die -O Option", which is a different
command-line option and likely won't work.
In particular, this script checks:
- Command-line flags
- printf-style format instructions
See the regex for details.
"""
import argparse
import collections
from typing import Any, List, Optional
import re
RE_PRINTFISH = re.compile(r"%[0-9a-zA-Z+-]+", re.M)
RE_CMDLINE_OPTION = re.compile(r"(?<![0-9a-zA-Z_-])-[0-9a-zA-Z_-]+", re.M)
# In-line unit test:
assert RE_PRINTFISH.findall("-foo bar --baz and %quux the -4") == ["%quux"]
assert RE_CMDLINE_OPTION.findall("-foo bar --baz and %quux the -4") == ["-foo", "--baz", "-4"]
ESCAPE_DICT = {
"t": "\t",
"n": "\n",
"\\": "\\",
'"': '"',
}
Translation = collections.namedtuple("Translation", ["msgid", "msgstr", "line_first"])
Issue = collections.namedtuple("Issue", ["translation", "reason"])
def unescape(line: str) -> str:
# Behold, the most inefficient but (hopefully) definitely-correct unescaper you have ever seen!
parts = []
is_quoted = False
is_escaping = False
for char in line:
if is_escaping:
is_escaping = False
assert char in ESCAPE_DICT, "unknown escape sequence >>\\%s<<" % char
parts.append(ESCAPE_DICT[char])
continue
if char == '"':
is_quoted = not is_quoted
continue
assert is_quoted, "char %s not inside doublequotes; string is not properly escaped" % char
if char == "\\":
is_escaping = True
continue
parts.append(char)
assert not is_quoted, "escaped string is not properly terminated"
assert not is_escaping, "unfinished escape sequence; also, string not terminated"
return "".join(parts)
# In-line unit test:
assert unescape('""') == ""
assert unescape('"asdf"') == "asdf"
assert unescape('"foo""bar"') == "foobar"
assert unescape('"Hello\\nWorld"') == "Hello\nWorld"
assert unescape('"Hello\\"World"') == 'Hello"World'
assert unescape('"fan\\\\cy"') == "fan\\cy"
def parse_po_data(raw_data: str) -> List[Translation]:
# None-ness indicates whether the start-tag has been seen already
line_first: Optional[int] = None
msgid_parts: Optional[List[str]] = None
msgstr_parts: Optional[List[str]] = None
translations: List[Translation] = []
i = -1
for i, line in enumerate(raw_data.split("\n")):
assert (line_first is None) == (msgid_parts is None), "line %d: inconsistent parser state" % (i + 1 - 1)
if not line or line.startswith("#"):
# Comment or empty line
continue
if msgid_parts is None:
assert line.startswith("msgid "), "line %d: expected beginning of msgid" % (i + 1)
if line.startswith("msgid "):
assert (msgid_parts is None) == (msgstr_parts is None), "line %d: start of new translation, but previous from line %s is not complete?!" % (i + 1, line_first)
if msgid_parts is not None:
assert msgstr_parts is not None # Otherwise, mypy does not understand that joining is okay.
translations.append(Translation(
"".join(msgid_parts),
"".join(msgstr_parts),
line_first,
))
line_first = i + 1
msgid_parts = []
msgstr_parts = None
line = line[len("msgid "): ]
assert line.startswith('"'), "line %d: msgid does not directly continue with string?!" % (i + 1)
if line.startswith("msgstr "):
assert line_first is not None
assert (msgid_parts is not None) and (msgstr_parts is None), "line %d: start of msgstr, but inconsistent state with line %d?!" % (i + 1, line_first)
msgstr_parts = []
line = line[len("msgstr "): ]
unescaped_line = unescape(line)
if msgstr_parts is None:
assert msgid_parts is not None
msgid_parts.append(unescaped_line)
else:
msgstr_parts.append(unescaped_line)
assert line_first is not None
assert (msgid_parts is None) == (msgstr_parts is None), "line %d: end of file, but translation from line %d is not complete?!" % (i + 1, line_first)
if msgid_parts is not None:
assert msgstr_parts is not None # Otherwise, mypy does not understand that joining is okay.
translations.append(Translation(
"".join(msgid_parts),
"".join(msgstr_parts),
line_first,
))
return translations
def lint_translations(translations: List[Translation], lint_printf: bool) -> List[Issue]:
issues: List[Issue] = []
for t in translations:
if not t.msgstr:
# Don't complain about missing translations
continue
cmdline_msgid = RE_CMDLINE_OPTION.findall(t.msgid)
cmdline_msgstr = RE_CMDLINE_OPTION.findall(t.msgstr)
if lint_printf:
printf_msgid = RE_PRINTFISH.findall(t.msgid)
printf_msgstr = RE_PRINTFISH.findall(t.msgstr)
else:
printf_msgid = []
printf_msgstr = []
# TODO: Try harder to pin-point the specific difference
if cmdline_msgid != cmdline_msgstr:
issues.append(Issue(
t,
f"mismatching mentions of command-line options: >>{cmdline_msgid}<< (in msgid) versus >>{cmdline_msgstr}<< (in msgstr)",
))
if printf_msgid != printf_msgstr:
issues.append(Issue(
t,
f"mismatching printf instructions: >>{printf_msgid}<< (in msgid) versus >>{printf_msgstr}<< (in msgstr)",
))
return issues
def run_on(openfile: Any, show_parsed_translations: bool, lint_printf: bool) -> None:
# TODO: openfile is a TextIOWrapper. How to 'type' this correctly?
raw_data: str = openfile.read()
translations: List[Translation] = parse_po_data(raw_data)
if show_parsed_translations:
for t in translations:
print(f"line {t.line_first}:")
print(f" {t.msgid}")
print(f" {t.msgstr}")
translation_issues: List[Issue] = lint_translations(translations, lint_printf)
for issue in translation_issues:
# TODO: Aggregate across files?
print(f"{openfile.name}:{issue.translation.line_first}: {issue.reason}")
print(f" msgid = {issue.translation.msgid}")
print(f" msgstr = {issue.translation.msgstr}")
def run(args: argparse.Namespace) -> None:
for openfile in args.filenames:
run_on(openfile, args.show_parsed_translations, args.lint_printf)
def make_parser() -> argparse.ArgumentParser:
parser = argparse.ArgumentParser(
description="Checks for grave translation errors and typos in command-line options, and optionally also printf-style instructions."
)
parser.add_argument(
"filenames",
nargs="+",
type=argparse.FileType("r", encoding="utf8"),
help="Paths to the PO file(s)",
)
parser.add_argument(
"--show-parsed-translations",
action="store_true",
help="Show all translations (useful to debug the parser)",
)
parser.add_argument(
"--lint-printf",
action="store_true",
help="Also check printf-style instructions",
)
return parser
if __name__ == "__main__":
run(make_parser().parse_args())