This repository has been archived by the owner on Sep 19, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpagetranslate.py
executable file
·448 lines (374 loc) · 14.1 KB
/
pagetranslate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
#!/usr/bin/env python
"""
Provides translatePage and other functions for generating word lists from
input strings.
The function 'translatePage' comes configured for Wikipedia and has been tested
with Wikipedia pages. Its title and body element ids are parameters, so it can
also be applied to any page with a header (h1) id and a body div id; however,
this has not been extensively tested.
Also turns relative html paths to absolute paths for links, css, and images
so generated page looks like the source page (for Wikipedia at least).
Currently does not translate/gibberize the following cases:
- Words with accented letters
(Future: convert to unaccented equivalents)
- Words with mixed letters/numbers (i.e. acronyms)
(Future: gibberize individual letters of the mixing)
- Numerics e.g. dates
(Future: Leave un-gibberized)
Other future plans:
- Allow percentages of words to be left un-gibberized
(Future: Leave shortest words first e.g. a, the, and, etc.)
- Expand to detect suffixes/prefixes so root words are gibberized consistently
"""
# Python library imports
from beautifulsoup4 import BeautifulSoup, SoupStrainer
import logging
import os
import random
import re
import string
import urllib2
# Local imports
import wordgen
__author__ = "Hillary Jeffrey"
__copyright__ = "Copyright 2017"
__credits__ = ["Hillary Jeffrey"]
__license__ = "GPL"
__version__ = "1.0"
__maintainer__ = "Hillary Jeffrey"
__email__ = "hillaryaj@gmail.com"
__status__ = "Development"
# Defaults used by translatePage and by the command-line interface
DEFAULT_TEST_PAGE = 'https://en.wikipedia.org/wiki/Gladiola'
DEFAULT_OUTPUT_PATH = os.getcwd()
DEFAULT_OUTFILE_NAME = "test.html"

# Element ids of the title and body containers on Wikipedia pages
WIKI_TITLE_ID = "firstHeading"
WIKI_BODY_ID = "mw-content-text"

# Capitalization types (keys of CAP_TYPES; returned by findCapsType)
LOWER = "lower"
TITLE = "title"
UPPER = "upper"

# Word types (returned by findWordType); NUMERIC is also a CAP_TYPES key
NUMERIC = "num"
LETTER = "letter"
ALPHA = "alpha"

# Maps a capitalization type to a callable that applies that capitalization
# to a replacement word. Plain string methods are used instead of the
# deprecated string.lower/capitalize/upper module functions, which no longer
# exist in Python 3; the results are identical for str and unicode input.
CAP_TYPES = {
    LOWER: lambda text: text.lower(),       # A standard, lower-case word
    TITLE: lambda text: text.capitalize(),  # Initial capital - title-case
    UPPER: lambda text: text.upper(),       # All-caps - an uppercase word
    NUMERIC: lambda text: text.upper(),     # A number/word containing numbers
}

# Set up regular expression for removing punctuation for parsing words
punctregex = re.compile('[%s]' % re.escape(string.punctuation))

# Set up regular expressions for determining word types/composition
numregex = re.compile('[0-9]')
capregex = re.compile('[A-Z]')
vowelregex = re.compile('[aoeui]')

# Set up logging
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
def _gibberizeStrings(tag, wordlist):
    """Replaces every text node under the given tag with gibberized text.

    Each run of word characters is passed through regexWordMatch, so
    punctuation and whitespace are kept and words missing from the
    wordlist are left unchanged.

    Keywords:
    tag - BeautifulSoup element (or soup) whose strings are replaced
    wordlist - Dict containing old/new word pairs
    """
    # Snapshot the strings first: replace_with modifies the tree that the
    # .strings generator walks
    textnodes = list(tag.strings)
    textnodes.reverse()
    for textnode in textnodes:
        textnode.replace_with(re.sub(r'(\w+)',
                                     lambda x: regexWordMatch(x, wordlist),
                                     unicode(textnode)))


def translatePage(
        pageurl=DEFAULT_TEST_PAGE,
        outputpath=DEFAULT_OUTPUT_PATH,
        outfile=DEFAULT_OUTFILE_NAME,
        titleid=WIKI_TITLE_ID,
        bodyid=WIKI_BODY_ID,
        percent=100):
    """
    Loads a given URL and scrambles the page's header and bodytext contents.
    Defaults are set for Wikipedia pages.

    This is Python 2 code (urllib2, unicode).

    Keywords:
    pageurl - URL for html page to load
    outputpath - Output file directory path (default: current directory)
    outfile - Output file name (should end in ".html" for easy browser display)
    titleid - html tag id for header text
    bodyid - html tag id for body text
    percent - [UNIMPLEMENTED] Percentage of words (0-100) to gibberize

    Returns the output file path

    Raises IndexError if no element with id 'titleid' is found and
    ValueError if that element has no contents.
    """
    wordlist = {}

    # Load the page, making sure the connection is closed afterwards
    response = urllib2.urlopen(pageurl)
    try:
        htmlobj = response.read()
    finally:
        response.close()

    titleStrainer = SoupStrainer(id=titleid)
    bodyStrainer = SoupStrainer(id=bodyid)

    # This is a hack to find baseurl for filling for absolute paths
    # DANGER: Requires properly-formatted url
    # DANGER: Performs no error checking for proper url
    baseurl = '/'.join(pageurl.split('/')[:3])

    # Make all links absolute to passed url so pages load css/etc properly
    # NOTE: Order is important so the // are done before the /
    htmlobj = re.sub('href="//', 'href="http://', htmlobj)
    htmlobj = re.sub('href="/', 'href="%s/' % baseurl, htmlobj)

    # Make image paths absolute so images load properly
    htmlobj = re.sub('src="//', 'src="http://', htmlobj)
    htmlobj = re.sub('srcset="//', 'srcset="http://', htmlobj)

    soup = BeautifulSoup(htmlobj, 'html.parser')

    # Find the title element so we can get the title as a random seed
    titlematches = soup.find_all(titleStrainer)
    if not titlematches:
        # Same exception type as the former bare [0] lookup, with a message
        raise IndexError("No element with id '%s' found in %s"
                         % (titleid, pageurl))
    h1 = titlematches[0]

    # Perform error checking
    if len(h1.contents) > 0:
        # Encode rather than str() so non-ASCII titles do not raise; this
        # also matches the encoding used for the body text below
        pagetitle = h1.get_text().encode('utf-8')
    else:
        raise ValueError("Header contents length error: len %i"
                         % (len(h1.contents)))

    # COOL PART: Set the random seed to the page title so the
    # article 'translation' will be reproducible
    random.seed(pagetitle)

    # Now get the body content
    div = BeautifulSoup(htmlobj, 'html.parser', parse_only=bodyStrainer)

    # Get the body text
    try:
        bodytext = div.get_text().encode('utf-8')
    except Exception as e:
        logging.error(
            "Unexpected error: {}\nDiv contained {} elements.".format(
                str(e),
                len(div.contents)
            )
        )
        # Bare raise keeps the original traceback
        raise

    # Now process the text into words - must include page title
    words = findUniqueWords(pagetitle + " " + bodytext)

    # Associate each word with the word type and number of syllables
    # Use a dictionary for future expansion of keywords (otherwise tuple)
    # NOTE: the loop order and the wordgen calls are kept exactly as before
    # so seeded output stays reproducible
    for word in words:
        wordkey = word.lower()
        wordtype = findWordType(word)
        syll = countSyllables(word)

        # FUTURE: Check for word roots (i.e. prefixes/suffixes)
        # Generate new words/letters based on word types
        if wordtype == NUMERIC:
            # Don't currently replace numerics
            rep = word
        elif wordtype == LETTER:
            if syll == 0:
                # Not a vowel? Replace with random consonant
                rep = wordgen.cons(syll)
            else:
                # Replace single vowel with a single random vowel
                rep = wordgen.vowel(syll)
        else:
            # Normal word; generate a new one
            rep = wordgen.word(syll)

        # FUTURE: If not using other keys outside 'rep' generation
        # then take out the rest of the dict
        wordlist[wordkey] = {'type': wordtype,
                             'syll': syll,
                             'rep': rep,
                             }

    # Replace all the instances of the words in the title and body.
    # Words are replaced but maintain original capitalization.
    # Walking every string (rather than h1.string) also handles titles
    # made of several child nodes, where h1.string would be None.
    _gibberizeStrings(h1, wordlist)
    _gibberizeStrings(div, wordlist)

    # Update the html with the gibberized body text
    soup.find(bodyStrainer).replace_with(div)

    # Make sure the output path exists - rudimentary path compliance
    abspath = os.path.abspath(outputpath)
    outputfile = os.path.join(abspath, outfile)

    # If the path does not exist, create it
    if not os.path.exists(abspath):
        os.makedirs(abspath)

    # Overwrite the file if it exists without asking.
    # soup.encode() yields encoded bytes, so write in binary mode.
    with open(outputfile, 'wb') as f:
        f.write(soup.encode())

    # Return the output file path
    return outputfile
def regexWordMatch(match, wordlist):
    """Looks up the replacement for a regex match in a wordlist dict.

    The lookup is done on the lower-cased matched text; the replacement
    is re-capitalized according to the capitalization type of the
    matched text.

    Keywords:
    match - Match object from regex
    wordlist - Dict containing old/new word pairs

    Returns the gibberized word if the matched word is in the wordlist,
    otherwise the matched text unchanged.
    """
    original = str(match.group(0))
    key = original.lower()
    capstype = findCapsType(original)

    # Guard clause: unknown words pass through untouched
    if key not in wordlist:
        logging.debug("No match found for '{}'".format(key))
        return original

    logging.debug("Matched word '{}'".format(key))
    recase = CAP_TYPES[capstype]
    return recase(wordlist[key]['rep'])
def stripPunctAndSplit(inputstring):
    """Turns an input string into an array of words.

    Every punctuation character is replaced by a space, then the result
    is split on whitespace.
    """
    spaced = punctregex.sub(' ', inputstring)
    return spaced.split()
def findUniqueWords(bodycontents, sortlist=True):
    """Compiles a list of the unique words in a string.

    Keywords:
    bodycontents - string input
    sortlist - T/F whether to sort word list by length (ascending)

    Returns an array of unique words (case-sensitive).
    """
    # Punctuation and whitespace are dropped by the split; the set drops
    # duplicate words
    unique = list(set(stripPunctAndSplit(bodycontents)))

    if not sortlist:
        return unique

    # Shortest words first. This is for future expansion to only gibberize
    # partial articles, leaving the most common short words
    # (e.g. a, and, the, etc.) and gibberizing the longer ones
    unique.sort(key=len)

    # TODO: Search for suffixes such as -ing -s -ed -ly so root words
    # remain related after gibberizing
    return unique
def findWordType(word):
    """Classifies a word as NUMERIC, LETTER or ALPHA.

    A word containing any digit is NUMERIC; otherwise a one-character
    word is a LETTER and anything else is a normal word (ALPHA).
    """
    # Any digit at all makes the word numeric
    if numregex.search(word) is not None:
        return NUMERIC
    return LETTER if len(word) == 1 else ALPHA
def findCapsType(word):
    """Determine given word's capitalization type.

    Returns one of:
    UPPER - every character is a capital letter, or the word consists
            only of capital letters and digits (e.g. "3D")
    TITLE - exactly one capital letter, and it is the first character
    LOWER - anything else (including mixed case such as "iPhone")
    """
    # Find the number of capital letters and digits in the word
    caps = len(capregex.findall(word))
    nums = len(numregex.findall(word))

    if caps == len(word):
        return UPPER
    if caps == 1 and capregex.match(word):
        # match() anchors at the start, so this verifies that the single
        # capital is the first letter. Without the check, words such as
        # "3D" were treated as title-case and capitalize() turned them
        # into "3d".
        return TITLE
    if caps + nums == len(word):
        # This is a mixed numeric with all-caps letters in it
        return UPPER
    return LOWER
def countSyllables(word):
    """Approximates the number of syllables in a word.

    As a simplification, the result of countVowels is used as the
    syllable count.
    """
    # TODO: Figure out some rough syllable count
    syllables = countVowels(word)
    return syllables
def countVowels(word):
    """Returns a vowel-based size estimate for the word.

    Counts the vowels (a, e, i, o, u - case-insensitive) in the word.
    Counts of 0, 1 or 2 are returned as-is; larger counts are halved,
    rounding down.
    """
    # vowelregex only lists lower-case vowels, so lower-case the word first;
    # otherwise "A" would count as having no vowels
    numvowels = len(vowelregex.findall(word.lower()))
    if numvowels > 2:
        # Floor division keeps the result an int even under true division
        numvowels = numvowels // 2
    return numvowels
def alterString(inputstring, wordlist):
    """Replaces words in the input string with their replacements
    in the given wordlist dict. Any word not in the wordlist keys
    will be skipped (the lookup is case-sensitive).

    This functionality has been superseded in main program flow
    by regex but is kept as a utility.

    Keywords:
    inputstring - input string to substitute words in
    wordlist - Dict containing old/new word information

    Returns the altered string.
    """
    # Split on single spaces only so the original spacing is restored
    # exactly by the join below
    chunks = inputstring.split(" ")

    for idx, chunk in enumerate(chunks):
        # A chunk may hold several words joined by punctuation
        words = stripPunctAndSplit(chunk)
        logging.debug("Word list:\n{}".format(repr(words)))

        for word in words:
            if word in wordlist:
                rep = wordlist[word]['rep']
                # str.replace instead of the deprecated string.replace
                chunks[idx] = chunks[idx].replace(word, rep)

    return " ".join(chunks)
if __name__ == "__main__":
    # Command-line entry point: parse the options, gibberize the page
    # and report where the output was written.
    # FUTURE: Handle or prompt for keyword arguments
    import argparse

    # Without configuration the root logger only shows WARNING and above,
    # which would hide the final "Page scrambled!" message
    logging.basicConfig(level=logging.INFO)

    parser = argparse.ArgumentParser(
        description='Takes a page and turns all its words into ' +
        'internally-consistent gibberish.'
    )
    parser.add_argument(
        '-url',
        '--page-url',
        default=DEFAULT_TEST_PAGE,
        type=str,
        dest='pageurl',
        help='URL of page to be gibberized ' +
        '(Default: https://en.wikipedia.org/wiki/Gladiola)'
    )
    parser.add_argument(
        '-path',
        '--output-path',
        default=DEFAULT_OUTPUT_PATH,
        type=str,
        dest='outputpath',
        help='Output path (Default: current directory)'
    )
    parser.add_argument(
        '-file',
        '--output-file',
        default=DEFAULT_OUTFILE_NAME,
        type=str,
        dest='outputfile',
        help='Output filename (Default: "test.html")'
    )
    # FUTURE IDEA: Support a few known sites and select title/body pair
    parser.add_argument(
        '-titleid',
        default=WIKI_TITLE_ID,
        type=str,
        dest='titleid',
        help='Title HTML element ID to gibberize ' +
        '(Default: Wikipedia: "firstHeading")'
    )
    parser.add_argument(
        '-bodyid',
        default=WIKI_BODY_ID,
        type=str,
        dest='bodyid',
        help='Body HTML element ID to gibberize ' +
        '(Default: Wikipedia: "mw-content-text")'
    )
    parser.add_argument(
        '-pct',
        '--percent-gibberize',
        default=100,
        type=int,
        dest='percent',
        help='(FUTURE)Percentage of words to change. (Default: 100)'
    )
    # The two flags below are parsed but not used yet
    parser.add_argument(
        '--convert-numbers',
        action='store_true',
        dest='convert_numbers',
        help='(FUTURE)Set to gibberize numbers. (Default: False)'
    )
    parser.add_argument(
        '--convert-dates',
        action='store_true',
        dest='convert_dates',
        help='(FUTURE)Set to gibberize dates. (Default: False)'
    )
    args = parser.parse_args()

    # Perform gibberizing
    logging.debug(
        "Translating page with specified arguments:\n{}".format(args)
    )
    outfile = translatePage(
        pageurl=args.pageurl,
        outputpath=args.outputpath,
        outfile=args.outputfile,
        titleid=args.titleid,
        bodyid=args.bodyid,
        # translatePage documents 'percent' as unimplemented, so passing
        # the parsed value (default 100) has no effect yet
        percent=args.percent
    )
    logging.info("Page scrambled! Output is at:\n{}".format(outfile))