From ea6e4c8c41c16b237b95553a4946f6e85a98a1f6 Mon Sep 17 00:00:00 2001 From: Jordan Anderson Date: Mon, 2 Sep 2019 22:06:23 -0600 Subject: [PATCH] Bring code closer to PEP 8 style --- __init__.py | 155 +++++---- cleaner.py | 960 +++++++++++++++++++++++++++------------------------- 2 files changed, 583 insertions(+), 532 deletions(-) diff --git a/__init__.py b/__init__.py index 67f6728..06a91f0 100644 --- a/__init__.py +++ b/__init__.py @@ -2,39 +2,46 @@ # vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai from __future__ import (unicode_literals, division, absolute_import, print_function, with_statement) +import time +import os +import mimetypes +import codecs +import sys +from functools import partial +from calibre_plugins.language_clean_plugin.cleaner import * +from calibre.ebooks.tweak import * +from optparse import OptionGroup, Option +from calibre.customize import FileTypePlugin logdir = "c:/Scratch/calibre" -__license__ = 'GPL v3' +__license__ = 'GPL v3' __copyright__ = '2012, Jordan Anderson' __docformat__ = 'restructuredtext en' #from __future__ import with_statement -import sys, os, time -from calibre.customize import FileTypePlugin -from optparse import OptionGroup, Option -from calibre.ebooks.tweak import * -from calibre_plugins.language_clean_plugin.cleaner import * -from functools import partial -import codecs, mimetypes + class CleanerPlugin(FileTypePlugin): - name = 'Language Cleaner' # Name of the plugin - description = 'Replace naughty or offensive language with something more acceptable (to me at least), recovered version' - supported_platforms = ['windows', 'osx', 'linux'] # Platforms this plugin will run on - author = 'Jordan Anderson' # The author of this plugin - version = (1, 5, 2017) # The version number of this plugin - file_types = set(['epub']) # The file types that this plugin will be applied to - on_preprocess = True # Run this plugin after conversion is complete + name = 'Language Cleaner' # Name of the plugin + description = ('Replace naughty or offensive language with something more ' + 'acceptable (to me at least), recovered version') + # Platforms this plugin will run on + supported_platforms = ['windows', 'osx', 'linux'] + author = 'Jordan Anderson' # The author of this plugin + version = (2019, 9, 2) # The version number of this plugin + # The file types that this plugin will be applied to + file_types = set(['epub']) + on_preprocess = True # Run this plugin after conversion is complete minimum_calibre_version = (0, 7, 53) def run(self, path_to_ebook): #print ("*"*60,"\n","you are in Language Cleaner") #print ("*"*60,"\n") - ebook_file=path_to_ebook + ebook_file = path_to_ebook fmt = ebook_file.rpartition('.')[-1].lower() exploder, rebuilder = get_tools(fmt) - with TemporaryDirectory('_tweak_'+ - os.path.basename(ebook_file).rpartition('.')[0]) as tdir: + tmppath = '_tweak_' + os.path.basename(ebook_file).rpartition('.')[0] + with TemporaryDirectory(tmppath) as tdir: #prints ("Relevant info:",tdir,fmt,ebook_file) try: opf = exploder(ebook_file, tdir) @@ -45,68 +52,74 @@ def run(self, path_to_ebook): except Error as e: prints(as_unicode(e), file=sys.stderr) raise SystemExit(1) - #Debug - print ("Created tdir:",tdir,"and found opf",opf) + # Debug + print ("Created tdir:", tdir, "and found opf", opf) #print (os.popen("ll "+tdir).read()) #print ("OPF CONTENTS:") #print (open(opf,'r').read()) - #manipulate all of the files - opf = open(opf,'r').read().split('\n') + # manipulate all of the files + opf = open(opf, 'r').read().split('\n') # first, assemble the entire text to evaluate context - text="" + text = "" for f in walk(tdir): - opf_line = [ii for ii in opf if os.path.basename(f).lower() in ii.lower()] - ftype = mimetypes.guess_type(f)[0] - if not ftype and "html" in f.split('.')[-1]: - print('Non-text type %s for file %s but forcing text mode'%(ftype, f)) - ftype = 'text' - if not ftype: - print('Non-text type %s for file %s'%(ftype, f)) - elif opf_line and 'text' in ftype: - encodings = ['utf-8', 'windows-1252', 'windows-1250'] - for e in encodings: - try: - text += codecs.open(f,'r',encoding=e).read() - except UnicodeDecodeError: - print('File %s: got unicode error with %s , trying different encoding' % (f,e)) - else: - print('File %s: opening the file with encoding: %s ' % (f,e)) - break + opf_line = [ii for ii in opf if + os.path.basename(f).lower() in ii.lower()] + ftype = mimetypes.guess_type(f)[0] + if not ftype and "html" in f.split('.')[-1]: + print('Non-text type %s for file %s but forcing text mode' + % (ftype, f)) + ftype = 'text' + if not ftype: + print('Non-text type %s for file %s' % (ftype, f)) + elif opf_line and 'text' in ftype: + encodings = ['utf-8', 'windows-1252', 'windows-1250'] + for e in encodings: + try: + text += codecs.open(f, 'r', encoding=e).read() + except UnicodeDecodeError: + print('File %s: got unicode error with %s , trying different encoding' % (f, e)) + else: + print('File %s: opening the file with encoding: %s ' % (f, e)) + break replacement_list = language_check(text) - start_text=text - end_text="" - #Now do replacements on each file + start_text = text + end_text = "" + # Now do replacements on each file for f in walk(tdir): - opf_line = [ii for ii in opf if os.path.basename(f).lower() in ii.lower()] - #Not sure what the correct way to determine which files should - # be edited. Seems like most are marked 'application/' in type - print ("File",f,"\nOPF line:\n",opf_line) - ftype = mimetypes.guess_type(f)[0] - if not ftype and "html" in f.split('.')[-1]: - print('Non-text type %s for file %s but forcing text mode'%(ftype, f)) - ftype = 'text' - if not ftype: - print('Non-text type %s for file %s'%(ftype, f)) - elif opf_line and 'text' in ftype: - print ("Cleaning",f) - text = open(f,'r').read() - output = "" - for line in text.split("\n"): - #Go through all elements of replacement_list - for search,sub,pcase in replacement_list: - if pcase: # Preserve case - line = search.sub(partial(pcase,sub),line) - else: # Don't preserve case - line = search.sub(sub,line) - output += line + "\n" - open(f,'w').write(output) - end_text += output - if start_text.replace('\n',"") == end_text.replace('\n',''): + opf_line = [ii for ii in opf if + os.path.basename(f).lower() in ii.lower()] + # Not sure what the correct way to determine which files should + # be edited. Seems like most are marked 'application/' in type + print ("File", f, "\nOPF line:\n", opf_line) + ftype = mimetypes.guess_type(f)[0] + if not ftype and "html" in f.split('.')[-1]: + print('Non-text type %s for file %s but forcing text mode' + % (ftype, f)) + ftype = 'text' + if not ftype: + print('Non-text type %s for file %s' % (ftype, f)) + elif opf_line and 'text' in ftype: + print ("Cleaning", f) + text = open(f, 'r').read() + output = "" + for line in text.split("\n"): + # Go through all elements of replacement_list + for search, sub, pcase in replacement_list: + if pcase: # Preserve case + line = search.sub(partial(pcase, sub), line) + else: # Don't preserve case + line = search.sub(sub, line) + output += line + "\n" + open(f, 'w').write(output) + end_text += output + if start_text.replace('\n', "") == end_text.replace('\n', ''): print ("Language cleaner made no changes") else: if os.path.exists(logdir): - open(logdir+os.sep+'%s_init.txt'%(os.path.basename(ebook_file)+str(time.time())),'w').write(start_text) - open(logdir+os.sep+'%s_mod.txt'%(os.path.basename(ebook_file)+str(time.time())),'w').write(end_text) + open(logdir+os.sep+'%s_init.txt' % + (os.path.basename(ebook_file)+str(time.time())), 'w').write(start_text) + open(logdir+os.sep+'%s_mod.txt' % + (os.path.basename(ebook_file)+str(time.time())), 'w').write(end_text) prints('Rebuilding', ebook_file, 'please wait ...') try: rebuilder(tdir, ebook_file) @@ -119,5 +132,3 @@ def run(self, path_to_ebook): #print (path_to_ebook,ext,str(mi)) #print ("you are returning from Language Cleaner") return ebook_file - - diff --git a/cleaner.py b/cleaner.py index 6cb9e52..b2e5e5a 100644 --- a/cleaner.py +++ b/cleaner.py @@ -1,472 +1,512 @@ #!/usr/bin/env python # vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai -import os,sys,re +import os +import sys +import re -#Substitute requested word matching case of matched word -def keep_case(sub,matchobj): - val = matchobj.group(0) - up_count = 0 - if val.isupper(): - sub = sub.upper() - else: - #Test first two to see if all uppercase - for ii in range(min(2,len(sub),len(val))): - if val[ii].isupper(): - up_count += 1 - sub = sub[:ii] + sub[ii].upper() + sub[ii+1:] - #Allow further uppercase only if all uppercase - for ii in range(min(len(sub),len(val))): - if up_count > 1: - up_count += 1 - sub = sub[:ii] + sub[ii].upper() + sub[ii+1:] - return sub -def first_case(sub,matchobj): - val = matchobj.group(0) - if val.isupper(): - sub = sub.upper() - else: - try: - for ii in range(1): - if val[ii].isupper(): - sub = sub[:ii] + sub[ii].upper() + sub[ii+1:] - except: - print "*"*60,"sub=",sub,"val=",val,"*"*60 - return sub +def keep_case(sub, matchobj): + ''' Substitute requested word matching case of matched word ''' + val = matchobj.group(0) + up_count = 0 + if val.isupper(): + sub = sub.upper() + else: + # Test first two to see if all uppercase + for ii in range(min(2, len(sub), len(val))): + if val[ii].isupper(): + up_count += 1 + sub = sub[:ii] + sub[ii].upper() + sub[ii+1:] + # Allow further uppercase only if all uppercase + for ii in range(min(len(sub), len(val))): + if up_count > 1: + up_count += 1 + sub = sub[:ii] + sub[ii].upper() + sub[ii+1:] + return sub + + +def first_case(sub, matchobj): + ''' Keep the case of the first lettter ''' + val = matchobj.group(0) + if val.isupper(): + sub = sub.upper() + else: + try: + for ii in range(1): + if val[ii].isupper(): + sub = sub[:ii] + sub[ii].upper() + sub[ii+1:] + except: + print "*"*60, "sub=", sub, "val=", val, "*"*60 + return sub + + +def drop_first_match(sub, matchobj): + ''' Drop first match, match case of first and return second ''' + drop = matchobj.group(1) + val = matchobj.group(2) + try: + for ii in range(len(drop)): # find first alpha in drop + if drop[ii].isalpha(): + if drop[ii].isupper(): # uppercase, so copy to val + for jj in range(len(val)): # find first alpha in val + if val[jj].isalpha(): + val = val[:jj] + val[jj].upper() + val[jj+1:] + break + break + except: + print "*"*50, "error in drop_first_match" + print drop + print val + print str(sub) + print str(matchobj.groups()) + return val -#Drop first match, match case of first and return second -def drop_first_match(sub,matchobj): - drop = matchobj.group(1) - val = matchobj.group(2) - try: - for ii in range(len(drop)): #find first alpha in drop - if drop[ii].isalpha() : - if drop[ii].isupper(): #uppercase, so copy to val - for jj in range(len(val)): #find first alpha in val - if val[jj].isalpha(): - val = val[:jj] + val[jj].upper() + val[jj+1:] - break - break - except: - print "*"*50, "error in drop_first_match" - print drop - print val - print str(sub) - print str(matchobj.groups()) - return val # Prepare two lists for different meanings of ass -dirty_a_list=[ - ######################################### - # dirtier ass - ######################################### - #haul ass - (re.compile(r'\b(move|haul)\Wass\b',re.I),"move fast",keep_case), - #little ass - (re.compile(r'little\W?ass\b',re.I),"little donkey",keep_case), - (re.compile(r'little\W?asses\b',re.I),"little donkeys",keep_case), - #your/own/etc. ass - (re.compile(r'(?<=(.your|..own|...my|..our|..her|..his|.this|.that|..the|their|those|these|..its|..for)\W)ass\b',re.I),"rear",keep_case), - (re.compile(r'(?<=(.your|..own|...my|..our|..her|..his|.this|.that|..the|their|those|these|..its|..for)\W)asses\b',re.I),"rears",keep_case), - #asses - (re.compile(r'\basses\b',re.I),"rears",keep_case), - #ass - (re.compile(r'\ban\Wass\b',re.I),"a jerk",keep_case), - (re.compile(r'\bass\b',re.I),"rear",keep_case), - ] +dirty_a_list = [ + ######################################### + # dirtier ass + ######################################### + # haul ass + (re.compile(r'\b(move|haul)\Wass\b', re.I), "move fast", keep_case), + # little ass + (re.compile(r'little\W?ass\b', re.I), "little donkey", keep_case), + (re.compile(r'little\W?asses\b', re.I), "little donkeys", keep_case), + #your/own/etc. ass + (re.compile(r'(?<=(.your|..own|...my|..our|..her|..his|.this|.that|..the|their|those|these|..its|..for)\W)ass\b', re.I), "rear", keep_case), + (re.compile(r'(?<=(.your|..own|...my|..our|..her|..his|.this|.that|..the|their|those|these|..its|..for)\W)asses\b', re.I), "rears", keep_case), + # asses + (re.compile(r'\basses\b', re.I), "rears", keep_case), + # ass + (re.compile(r'\ban\Wass\b', re.I), "a jerk", keep_case), + (re.compile(r'\bass\b', re.I), "rear", keep_case), +] -clean_a_list=[ - ######################################### - # cleaner ass - ######################################### - #haul ass - (re.compile(r'\bhaul\Wass\b',re.I),"move fast",keep_case), - #asses - (re.compile(r'\basses\b',re.I),"donkeys",keep_case), - #ass - (re.compile(r'\ban Ass\b'),"a Donkey",False), #C.S. Lewis - (re.compile(r'\ban\Wass\b',re.I),"a donkey",keep_case), - (re.compile(r'(? worth a cent (couldn't think of a better word) - (re.compile(r'((matters?|worth|of)\W+a\W+)(?:gods? *)?damn\b',re.I),r'\1cent',False), - #of the damned - (re.compile(r'(of\W*the\W*)(?:gods? *)?damned\b',re.I),r'\1cursed',False), - #Your damned word, a damn word, etc - (re.compile(r'(your|our|her|his|this|that|the|their|hose|these|for|so|some|one|one more|too)( +)(?:gods? *)?damn(?:ed)?\b(?!-)',re.I),r'\1',False), - #a damn - (re.compile(r'(?<=\b[aA] )(?:gods? *)?damn(?:ed)',re.I),'darn',keep_case), - #damned good, damn sure, etc (Clancy's favorites) - (re.compile(r'\b((?:gods? *)?damn(?:ed))(?:\W+)(sure|near|sight|good|much|hard|easy|big|little|glad|clever|mess|smart|fine|fool|right|thing|much|shame|nice|mean|bad|lucky|late|important)',re.I),'',drop_first_match), - (re.compile(r'\b((?:gods? *)?damn(?:ed))(?:\W+)well',re.I),'darn well',keep_case), - #Religious damning - (re.compile(r'\b(?:gods? *)?damned',re.I),'cursed',keep_case), - (re.compile(r'\b(?:gods? *)?damndest',re.I),'very best',keep_case), - (re.compile(r'\b(?:gods? *)?damning',re.I),'condemning',keep_case), - (re.compile(r'\b(?:gods? *)?damnable',re.I),'condemning',keep_case), - (re.compile(r'\b(?:gods? *)?damnably',re.I),'cursedly',keep_case), - (re.compile(r'\b(?:gods? *)?damnatory',re.I),'condemning',keep_case), - (re.compile(r'\b(?:gods? *)?damnation',re.I),'condemnation',keep_case), - #damn it - (re.compile(r', (?:gods? *)?damn it(?: all)?',re.I),'',keep_case), - (re.compile(r'((?:gods? *)?damn it(?: all)?, +)(.)',re.I),'',drop_first_match), - #a damn something, like "a damn nuisance" - (re.compile(r'\ba(\W+)(?:gods? *)?damn',re.I),r'a\1blasted',False), - #damn you/his/her/etc - (re.compile(r'\b(?:gods? *)?damn you to hell',re.I),'curse you',keep_case), - (re.compile(r'\b(?:gods? *)?damn(?= (him|his|her|you|next|the|you))',re.I),'curse',keep_case), - #Word by itself - (re.compile(r'\b(?:gods? *)?damn\b',re.I),'dang',keep_case), - #Final catch-all - (re.compile(r'(?:gods? *)?damn',re.I),'dang',keep_case), - ######################################### - # Bitch - ######################################### - #Son of a bitch - (re.compile(r's[UuOo]n(s)?([ -])?[OoUu][FfVv]([ -])?(a)?([ -])?bitch(e)?',re.I),'jerk',keep_case), - #verb - (re.compile(r'bitchin[^\s]',re.U+re.I),'complaining',keep_case), - (re.compile(r'bitched',re.I),'complained',keep_case), - (re.compile(r'bitche?(?=s? abo)',re.I),'complain',keep_case), - (re.compile(r'(?<=(n([^\s]|o)t ))bitch',re.U+re.I),'complain',keep_case), - #A bitch - (re.compile(r's a bitch',re.I),'s tough',keep_case), - #Bitch by itself - (re.compile(r'\bbitch(e)?',re.I),'jerk',keep_case), - ######################################### - # Shit - ######################################### - #bullshit - (re.compile(r'\b(bull|horse|dog|jack)(.)?shit',re.I),'shit',keep_case), - #Holy shit - (re.compile(r'\bholy\W*shit',re.I),'incredible',keep_case), - #exclamantion - (re.compile(r'(?<=oh, )shit\b',re.I),'shoot',keep_case), - (re.compile(r'(?<=oh )shit\b',re.I),'shoot',keep_case), - (re.compile(r'(?hell< shows up in html with italics or emphasis - (re.compile(r'\>hell\<',re.U+re.I),'>perdition<',keep_case), + ######################################### + # Random stuff + ######################################### + # Remove suggestive 'tits' with not suggestive belly + # don't do 'tit for tat', tit-tat-toe, or split tit-ular + (re.compile(r'\b[tT][iI][tT][sS]?\b(?! for)(?!-tat)(?!-ul)', + re.I), 'belly', keep_case), + # Slut is rude, replace with slightly better hussy + (re.compile(r'\bslut\b', re.I), 'hussy', keep_case), + (re.compile(r'\bsluts\b', re.I), 'hussies', keep_case), + # Change topless bar to just bar + (re.compile(r'topless\Wbar', re.I), 'bar', keep_case), + # Replace whore with woman (not always a good replacement) + # (re.compile(r'\bwhore\b',re.I),'woman',keep_case), + # (re.compile(r'\bwhores\b',re.I),'women',keep_case), + # Whorehouse becomes brothel + (re.compile(r'whorehouse', re.I), 'brothel', keep_case), + # Crap and crapper to 'use the toilet' + (re.compile(r'take\Wa\Wcrap(per)?', re.I), 'use the toilet', keep_case), + (re.compile(r'\bcrapper', re.I), 'toilet', keep_case), + # Crap and crapper to garbage + (re.compile(r'\bcrap\b', re.I), 'garbage', keep_case), + (re.compile(r'\bcrapped\b', re.I), 'wet', keep_case), + # Cock-up with mess-up + (re.compile(r'\bcock.?up\b', re.I), "mess up", keep_case), + # Cocksucker with sucker + (re.compile(r'\bcock.?(?=suc)', re.I), "", False), + # Cocker with idiot (but not cocker spaniel + (re.compile(r'\bcocker\b(?![ -]spani)', re.I), "idiot", keep_case), + # Cunt + (re.compile(r'\bcunt\b', re.I), 'groin', keep_case), + # Replace goddammit and dammit with 'dang it' + (re.compile(r'([^\.?!] *) Goddam([mn])', re.I), r'\1 goddam\2', False), + (re.compile(r'(?:gods?)?dammit', re.I), 'dang it', keep_case), + ######################################### + # Replace ass and its varieties (see specific lists above, dirty_a_list and clean_a_list) + ######################################### + # smart ass + (re.compile(r'smart\W?ass\b', re.I), "smart aleck", keep_case), + (re.compile(r'smart\W?asses\b', re.I), "smart alecks", keep_case), + # kiss ass + (re.compile(r'kissin[^\s]\Wass(es)?\b', + re.U+re.I), "kissing up", keep_case), + (re.compile(r'kiss.{1,6}ass(es)?\b', re.I), "fly a kite", keep_case), + # kick ass + (re.compile(r'kick\W?ass\b', re.I), "kick booty", keep_case), + (re.compile(r'kick\W?asses\b', re.I), "kick booties", keep_case), + # cover ... ass + (re.compile(r'(cover.{0,8} )ass\b', re.I), r"\1rear", False), + (re.compile(r'(cover.{0,8} )asses\b', re.I), r"\1rears", False), + # kick ... ass + (re.compile(r'(kick.{0,8} )ass\b', re.I), r"\1rear", False), + (re.compile(r'(kick.{0,8} )ass\b', re.I), r"\1rears", False), + # assed + (re.compile(r'\bassed\b', re.I), "ended", keep_case), + # jack/dumbass + (re.compile(r'(?<=bray like a )(jack|dumb)ass\b', re.I), "donkey", keep_case), + (re.compile(r'(jack|dumb)ass\b', re.I), "jerk", keep_case), + (re.compile(r'(jack|dumb)asses\b', re.I), "jerks", keep_case), + # asshole + (re.compile(r'an\Wasshole', re.I), "a jerk", keep_case), + (re.compile(r'asshole', re.I), "jerk", keep_case), + # horse's ass + (re.compile(r'horse[^\s]?s ?ass\b', re.U+re.I), "jerk", keep_case), + (re.compile(r'horse[^\s]?s ?asses\b', re.U+re.I), "jerks", keep_case), + ######################################### + # Replace damn and its varieties + ######################################### + # I'll be damned + (re.compile(r'be(\W+)(?:gods? *)?damned', re.I), r'be\1darned', False), + # Give a damn + (re.compile(r'give(\W+.{0,10}?)a(\W+)(?:gods? *)?damn', + re.I), 'care', keep_case), + (re.compile( + r'gives(\W+.{0,10}?)a(\W+)(?:gods? *)?damn', re.I), 'cares', keep_case), + # Damn near + (re.compile(r'(?:gods? *)?damn(\W+)near', re.I), 'nearly', keep_case), + # a damn. Worth a damn -> worth a cent (couldn't think of a better word) + (re.compile(r'((matters?|worth|of)\W+a\W+)(?:gods? *)?damn\b', re.I), r'\1cent', False), + # of the damned + (re.compile(r'(of\W*the\W*)(?:gods? *)?damned\b', re.I), r'\1cursed', False), + # Your damned word, a damn word, etc + (re.compile(r'(your|our|her|his|this|that|the|their|hose|these|for|so|some|one|one more|too)( +)(?:gods? *)?damn(?:ed)?\b(?!-)', re.I), r'\1', False), + # a damn + (re.compile(r'(?<=\b[aA] )(?:gods? *)?damn(?:ed)', + re.I), 'darn', keep_case), + # damned good, damn sure, etc (Clancy's favorites) + (re.compile(r'\b((?:gods? *)?damn(?:ed))(?:\W+)(sure|near|sight|good|much|hard|easy|big|little|glad|clever|mess|smart|fine|fool|right|thing|much|shame|nice|mean|bad|lucky|late|important)', re.I), '', drop_first_match), + (re.compile(r'\b((?:gods? *)?damn(?:ed))(?:\W+)well', re.I), 'darn well', keep_case), + # Religious damning + (re.compile(r'\b(?:gods? *)?damned', re.I), 'cursed', keep_case), + (re.compile(r'\b(?:gods? *)?damndest', re.I), 'very best', keep_case), + (re.compile(r'\b(?:gods? *)?damning', re.I), 'condemning', keep_case), + (re.compile(r'\b(?:gods? *)?damnable', re.I), 'condemning', keep_case), + (re.compile(r'\b(?:gods? *)?damnably', re.I), 'cursedly', keep_case), + (re.compile(r'\b(?:gods? *)?damnatory', re.I), 'condemning', keep_case), + (re.compile(r'\b(?:gods? *)?damnation', re.I), 'condemnation', keep_case), + # damn it + (re.compile(r', (?:gods? *)?damn it(?: all)?', re.I), '', keep_case), + (re.compile(r'((?:gods? *)?damn it(?: all)?, +)(.)', re.I), '', drop_first_match), + # a damn something, like "a damn nuisance" + (re.compile(r'\ba(\W+)(?:gods? *)?damn', re.I), r'a\1blasted', False), + # damn you/his/her/etc + (re.compile(r'\b(?:gods? *)?damn you to hell', re.I), 'curse you', keep_case), + (re.compile(r'\b(?:gods? *)?damn(?= (him|his|her|you|next|the|you))', re.I), + 'curse', keep_case), + # Word by itself + (re.compile(r'\b(?:gods? *)?damn\b', re.I), 'dang', keep_case), + # Final catch-all + (re.compile(r'(?:gods? *)?damn', re.I), 'dang', keep_case), + ######################################### + # Bitch + ######################################### + # Son of a bitch + (re.compile(r's[UuOo]n(s)?([ -])?[OoUu][FfVv]([ -])?(a)?([ -])?bitch(e)?', + re.I), 'jerk', keep_case), + # verb + (re.compile(r'bitchin[^\s]', re.U+re.I), 'complaining', keep_case), + (re.compile(r'bitched', re.I), 'complained', keep_case), + (re.compile(r'bitche?(?=s? abo)', re.I), 'complain', keep_case), + (re.compile(r'(?<=(n([^\s]|o)t ))bitch', + re.U+re.I), 'complain', keep_case), + # A bitch + (re.compile(r's a bitch', re.I), 's tough', keep_case), + # Bitch by itself + (re.compile(r'\bbitch(e)?', re.I), 'jerk', keep_case), + ######################################### + # Shit + ######################################### + # bullshit + (re.compile(r'\b(bull|horse|dog|jack)(.)?shit', re.I), 'shit', keep_case), + # Holy shit + (re.compile(r'\bholy\W*shit', re.I), 'incredible', keep_case), + # exclamantion + (re.compile(r'(?<=oh, )shit\b', re.I), 'shoot', keep_case), + (re.compile(r'(?<=oh )shit\b', re.I), 'shoot', keep_case), + (re.compile(r'(?hell< shows up in html with italics or emphasis + (re.compile(r'\>hell\<', re.U+re.I), '>perdition<', keep_case), ] #+ ass_list + lord_list DEBUG = True + + def language_check(text): - ret_val = re_list + lord_list - # Determine if this book is likely to take Lord's name in vain - if re.search("(for Christ's sake!|Holy Christ!|Holy Jesus!|for God's sake!|God almighty!|goddamn|fuck)",text,re.I): - if DEBUG: - print "Looks like book uses Lord's name in vain" - ret_val += vain_lord_list - else: - if DEBUG: - print "Looks like book does not use Lord's name in vain" - #Ass has two very different contexts. Guess which to use. - if re.search("(dumbass|asshole|smart ass|kick ass|ass kick|ass handed|badass|cover.{0,5}ass)",text): - ret_val += dirty_a_list - if DEBUG: - print "Looks like book does not need the donkey treatment" - else: - ret_val += clean_a_list - if DEBUG: - print "Looks like book calls donkeys asses" - #open('/tmp/dump.txt','w').write(text) - return ret_val + ret_val = re_list + lord_list + # Determine if this book is likely to take Lord's name in vain + if re.search("(for Christ's sake!|Holy Christ!|Holy Jesus!|for God's sake!|God almighty!|goddamn|fuck)", text, re.I): + if DEBUG: + print "Looks like book uses Lord's name in vain" + ret_val += vain_lord_list + else: + if DEBUG: + print "Looks like book does not use Lord's name in vain" + # Ass has two very different contexts. Guess which to use. + if re.search("(dumbass|asshole|smart ass|kick ass|ass kick|ass handed|badass|cover.{0,5}ass)", text): + ret_val += dirty_a_list + if DEBUG: + print "Looks like book does not need the donkey treatment" + else: + ret_val += clean_a_list + if DEBUG: + print "Looks like book calls donkeys asses" + # open('/tmp/dump.txt','w').write(text) + return ret_val + ''' from functools import partial @@ -474,25 +514,25 @@ def language_check(text): text = codecs.open('bad.txt', encoding='utf-8').read() #if DEBUG: -# print text -# print "-"*40 -# print "-"*40 +# print text +# print "-"*40 +# print "-"*40 output = "" replacement_list = language_check(text) output = "" for line in text.split("\n"): - #Go through all elements of replacement_list - for search,sub,pcase in replacement_list: - if pcase: # Preserve case - line = search.sub(partial(pcase,sub),line) - else: # Don't preserve case - line = search.sub(sub,line) - output += line + "\n" + #Go through all elements of replacement_list + for search,sub,pcase in replacement_list: + if pcase: # Preserve case + line = search.sub(partial(pcase,sub),line) + else: # Don't preserve case + line = search.sub(sub,line) + output += line + "\n" #if DEBUG: -# print output +# print output codecs.open('clensed.txt','w', encoding='utf-8').write(output) '''