diff --git a/Language_Cleaner.zip b/Language_Cleaner.zip
new file mode 100644
index 0000000..cfedccf
Binary files /dev/null and b/Language_Cleaner.zip differ
diff --git a/README.md b/README.md
index 25c56db..3ecf955 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,26 @@
 # calibre-plugin-language-cleaner
 Lengthy list of regexes to "clean up" language in books.
+
+I wrote this plugin because I don't like reading vulgar language, but I do like reading books that happen to contain it :). Personally I find books much more enjoyable after they have been processed with this script. Obviously it is a personal set of filters, but I've done my best to make the changes sound as natural as possible, and after using it for years, I think it's pretty good.
+
+If you'd like to customize it to meet your preferences, you just need to go through the lines of cleaner.py and add or remove filters as needed. You'll probably need a pretty good mastery of regular expressions to write new ones, unless a similar one already exists that you can tweak.
+
+## Limitations
+
+I am no expert at calibre, and I could not drum up much help on the support forums, so the integration is pretty weak. It only works on books that are being converted from epub, and only works during the conversion process.
+
+To install:
+* Create a zip file named `Language_Cleaner.zip` containing the three files `cleaner.py`, `__init__.py`, and `plugin-import-name-language_clean_plugin.txt`. This command may help on Linux:
+`zip Language_Cleaner cleaner.py __init__.py plugin-import-name-language_clean_plugin.txt`
+* In calibre choose Preferences -> Plugins -> Load plugin from file
+* Choose the zip you just created, and the plugin should show up under "File type plugins"
+
+To use:
+* Choose the book you'd like and make sure you have an epub format (so convert to epub if you don't already have that format)
+* Now do "Convert book" and choose to convert from Epub to Epub (or whatever destination format you want)
+* Wait for the longer-than-usual job to complete, due to the very inefficient way this plugin works
+
+Secret debug tip:
+If there is a "c:/Scratch/calibre" folder on your Windows machine (change `logdir` in `__init__.py` if you want a different location), the plugin will write before and after versions of the book as plain text files. Sometimes it writes two copies and only one has useful changes. If you'd like to see how the book was changed, compare the two files. I use [WinMerge](http://winmerge.org/) and that works well.
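+
+To add a filter of your own, append an entry to one of the replacement lists in `cleaner.py` (for example `dirty_a_list` or `clean_a_list`). Each entry is a tuple of a compiled regular expression, the replacement text, and either a case-matching helper (`keep_case`, `first_case`, or `drop_first_match`) or `False` to substitute the replacement without any case matching. A minimal sketch -- the pattern and replacement below are made-up examples, not filters that ship with the plugin:
+
+```python
+# Hypothetical entry: replace the whole word "frak" with "frick",
+# matching the case of the original word (Frak -> Frick, FRAK -> FRICK).
+(re.compile(r'\bfrak\b', re.I), 'frick', keep_case),
+```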
+
+By the way, there is a strong layer of irony here -- if vulgar language offends you, you'll probably want to avoid actually looking in the `cleaner.py` file, as it is chock full of it :)
diff --git a/__init__.py b/__init__.py
new file mode 100644
index 0000000..67f6728
--- /dev/null
+++ b/__init__.py
@@ -0,0 +1,123 @@
+#!/usr/bin/env python
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import (unicode_literals, division, absolute_import,
+                        print_function, with_statement)
+logdir = "c:/Scratch/calibre"
+__license__ = 'GPL v3'
+__copyright__ = '2012, Jordan Anderson'
+__docformat__ = 'restructuredtext en'
+
+#from __future__ import with_statement
+import sys, os, time
+from calibre.customize import FileTypePlugin
+from optparse import OptionGroup, Option
+from calibre.ebooks.tweak import *
+from calibre_plugins.language_clean_plugin.cleaner import *
+from functools import partial
+import codecs, mimetypes
+
+class CleanerPlugin(FileTypePlugin):
+
+    name                = 'Language Cleaner'  # Name of the plugin
+    description         = 'Replace naughty or offensive language with something more acceptable (to me at least), recovered version'
+    supported_platforms = ['windows', 'osx', 'linux']  # Platforms this plugin will run on
+    author              = 'Jordan Anderson'  # The author of this plugin
+    version             = (1, 5, 2017)  # The version number of this plugin
+    file_types          = set(['epub'])  # The file types that this plugin will be applied to
+    on_preprocess       = True  # Run this plugin just before conversion starts
+    minimum_calibre_version = (0, 7, 53)
+
+    def run(self, path_to_ebook):
+        #print ("*"*60,"\n","you are in Language Cleaner")
+        #print ("*"*60,"\n")
+        ebook_file = path_to_ebook
+        fmt = ebook_file.rpartition('.')[-1].lower()
+        exploder, rebuilder = get_tools(fmt)
+        with TemporaryDirectory('_tweak_'+
+                os.path.basename(ebook_file).rpartition('.')[0]) as tdir:
+            #prints ("Relevant info:",tdir,fmt,ebook_file)
+            try:
+                opf = exploder(ebook_file, tdir)
+            except WorkerError as e:
+                prints('Failed to unpack', ebook_file)
+                prints(e.orig_tb)
+                raise SystemExit(1)
+            except Error as e:
+                prints(as_unicode(e), file=sys.stderr)
+                raise SystemExit(1)
+            #Debug
+            print ("Created tdir:",tdir,"and found opf",opf)
+            #print (os.popen("ll "+tdir).read())
+            #print ("OPF CONTENTS:")
+            #print (open(opf,'r').read())
+            #manipulate all of the files
+            opf = open(opf,'r').read().split('\n')
+            # first, assemble the entire text to evaluate context
+            text = ""
+            for f in walk(tdir):
+                opf_line = [ii for ii in opf if os.path.basename(f).lower() in ii.lower()]
+                ftype = mimetypes.guess_type(f)[0]
+                if not ftype and "html" in f.split('.')[-1]:
+                    print('Non-text type %s for file %s but forcing text mode'%(ftype, f))
+                    ftype = 'text'
+                if not ftype:
+                    print('Non-text type %s for file %s'%(ftype, f))
+                elif opf_line and 'text' in ftype:
+                    encodings = ['utf-8', 'windows-1252', 'windows-1250']
+                    for e in encodings:
+                        try:
+                            text += codecs.open(f,'r',encoding=e).read()
+                        except UnicodeDecodeError:
+                            print('File %s: got unicode error with %s , trying different encoding' % (f,e))
+                        else:
+                            print('File %s: opening the file with encoding: %s ' % (f,e))
+                            break
+            replacement_list = language_check(text)
+            start_text = text
+            end_text = ""
+            #Now do replacements on each file
+            for f in walk(tdir):
+                opf_line = [ii for ii in opf if os.path.basename(f).lower() in ii.lower()]
+                #Not sure what the correct way to determine which files should
+                # be edited. Seems like most are marked 'application/' in type
+                print ("File",f,"\nOPF line:\n",opf_line)
+                ftype = mimetypes.guess_type(f)[0]
+                if not ftype and "html" in f.split('.')[-1]:
+                    print('Non-text type %s for file %s but forcing text mode'%(ftype, f))
+                    ftype = 'text'
+                if not ftype:
+                    print('Non-text type %s for file %s'%(ftype, f))
+                elif opf_line and 'text' in ftype:
+                    print ("Cleaning",f)
+                    text = open(f,'r').read()
+                    output = ""
+                    for line in text.split("\n"):
+                        #Go through all elements of replacement_list
+                        for search,sub,pcase in replacement_list:
+                            if pcase:  # Preserve case
+                                line = search.sub(partial(pcase,sub),line)
+                            else:  # Don't preserve case
+                                line = search.sub(sub,line)
+                        output += line + "\n"
+                    open(f,'w').write(output)
+                    end_text += output
+            if start_text.replace('\n',"") == end_text.replace('\n',''):
+                print ("Language cleaner made no changes")
+            else:
+                if os.path.exists(logdir):
+                    open(logdir+os.sep+'%s_init.txt'%(os.path.basename(ebook_file)+str(time.time())),'w').write(start_text)
+                    open(logdir+os.sep+'%s_mod.txt'%(os.path.basename(ebook_file)+str(time.time())),'w').write(end_text)
+            prints('Rebuilding', ebook_file, 'please wait ...')
+            try:
+                rebuilder(tdir, ebook_file)
+            except WorkerError as e:
+                prints('Failed to rebuild', ebook_file)
+                prints(e.orig_tb)
+                raise SystemExit(1)
+            prints(ebook_file, 'successfully cleaned')
+
+        #print (path_to_ebook,ext,str(mi))
+        #print ("you are returning from Language Cleaner")
+        return ebook_file
+
+
diff --git a/cleaner.py b/cleaner.py
new file mode 100644
index 0000000..698a580
--- /dev/null
+++ b/cleaner.py
@@ -0,0 +1,479 @@
+#!/usr/bin/env python
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import print_function
+import os,sys,re
+#text = open('bad.txt').read()
+
+#Substitute requested word matching case of matched word
+def keep_case(sub,matchobj):
+    val = matchobj.group(0)
+    up_count = 0
+    if val.isupper():
+        sub = sub.upper()
+    else:
+        #Test first two to see if all uppercase
+        for ii in range(min(2,len(sub),len(val))):
+            if val[ii].isupper():
+                up_count += 1
+                sub = sub[:ii] + sub[ii].upper() + sub[ii+1:]
+        #Allow further uppercase only if all uppercase
+        for ii in range(min(len(sub),len(val))):
+            if up_count > 1:
+                up_count += 1
+                sub = sub[:ii] + sub[ii].upper() + sub[ii+1:]
+    return sub
+
+def first_case(sub,matchobj):
+    val = matchobj.group(0)
+    if val.isupper():
+        sub = sub.upper()
+    else:
+        try:
+            for ii in range(1):
+                if val[ii].isupper():
+                    sub = sub[:ii] + sub[ii].upper() + sub[ii+1:]
+        except:
+            print("*"*60,"sub=",sub,"val=",val,"*"*60)
+    return sub
+
+#Drop first match, match case of first and return second
+def drop_first_match(sub,matchobj):
+    drop = matchobj.group(1)
+    val = matchobj.group(2)
+    try:
+        for ii in range(len(drop)):  #find first alpha in drop
+            if drop[ii].isalpha():
+                if drop[ii].isupper():  #uppercase, so copy to val
+                    for jj in range(len(val)):  #find first alpha in val
+                        if val[jj].isalpha():
+                            val = val[:jj] + val[jj].upper() + val[jj+1:]
+                            break
+                break
+    except:
+        print("*"*50, "error in drop_first_match")
+        print(drop)
+        print(val)
+        print(str(sub))
+        print(str(matchobj.groups()))
+    return val
+
+# Prepare two lists for different meanings of ass
+dirty_a_list=[
+    #########################################
+    # dirtier ass
+    #########################################
+    #haul ass
+    (re.compile(r'\b(move|haul)\Wass\b',re.I),"move fast",keep_case),
+    #little ass
+    (re.compile(r'little\W?ass\b',re.I),"little donkey",keep_case),
+    (re.compile(r'little\W?asses\b',re.I),"little donkeys",keep_case),
+    #your/own/etc. ass
+    (re.compile(r'(?<=(.your|..own|...my|..our|..her|..his|.this|.that|..the|their|those|these|..its|..for)\W)ass\b',re.I),"rear",keep_case),
+    (re.compile(r'(?<=(.your|..own|...my|..our|..her|..his|.this|.that|..the|their|those|these|..its|..for)\W)asses\b',re.I),"rears",keep_case),
+    #asses
+    (re.compile(r'\basses\b',re.I),"rears",keep_case),
+    #ass
+    (re.compile(r'\ban\Wass\b',re.I),"a jerk",keep_case),
+    (re.compile(r'\bass\b',re.I),"rear",keep_case),
+    ]
+
+clean_a_list=[
+    #########################################
+    # cleaner ass
+    #########################################
+    #haul ass
+    (re.compile(r'\bhaul\Wass\b',re.I),"move fast",keep_case),
+    #asses
+    (re.compile(r'\basses\b',re.I),"donkeys",keep_case),
+    #ass
+    (re.compile(r'\ban Ass\b'),"a Donkey",False),  #C.S. Lewis
+    (re.compile(r'\ban\Wass\b',re.I),"a donkey",keep_case),
+    (re.compile(r'(? worth a cent (couldn't think of a better word)
+    (re.compile(r'((matters?|worth|of)\W+a\W+)(?:gods? *)?damn\b',re.I),r'\1cent',False),
+    #of the damned
+    (re.compile(r'(of\W*the\W*)(?:gods? *)?damned\b',re.I),r'\1cursed',False),
+    #Your damned word, a damn word, etc
+    (re.compile(r'(your|our|her|his|this|that|the|their|hose|these|for|so|some|one|one more|too)( +)(?:gods? *)?damn(?:ed)?\b(?!-)',re.I),r'\1',False),
+    #a damn
+    (re.compile(r'(?<=\b[aA] )(?:gods? *)?damn(?:ed)',re.I),'darn',keep_case),
+    #damned good, damn sure, etc (Clancy's favorites)
+    (re.compile(r'\b((?:gods? *)?damn(?:ed))(?:\W+)(sure|near|sight|good|much|hard|easy|big|little|glad|clever|mess|smart|fine|fool|right|thing|much|shame|nice|mean|bad|lucky|late|important)',re.I),'',drop_first_match),
+    (re.compile(r'\b((?:gods? *)?damn(?:ed))(?:\W+)well',re.I),'darn well',keep_case),
+    #Religious damning
+    (re.compile(r'\b(?:gods? *)?damned',re.I),'cursed',keep_case),
+    (re.compile(r'\b(?:gods? *)?damndest',re.I),'very best',keep_case),
+    (re.compile(r'\b(?:gods? *)?damning',re.I),'condemning',keep_case),
+    (re.compile(r'\b(?:gods? *)?damnable',re.I),'condemning',keep_case),
+    (re.compile(r'\b(?:gods? *)?damnably',re.I),'cursedly',keep_case),
+    (re.compile(r'\b(?:gods? *)?damnatory',re.I),'condemning',keep_case),
+    (re.compile(r'\b(?:gods? *)?damnation',re.I),'condemnation',keep_case),
+    #damn it
+    (re.compile(r', (?:gods? *)?damn it(?: all)?',re.I),'',keep_case),
+    (re.compile(r'((?:gods? *)?damn it(?: all)?, +)(.)',re.I),'',drop_first_match),
+    #a damn something, like "a damn nuisance"
+    (re.compile(r'\ba(\W+)(?:gods? *)?damn',re.I),r'a\1blasted',False),
+    #damn you/his/her/etc
+    (re.compile(r'\b(?:gods? *)?damn you to hell',re.I),'curse you',keep_case),
+    (re.compile(r'\b(?:gods? *)?damn(?= (him|his|her|you|next|the|you))',re.I),'curse',keep_case),
+    #Word by itself
+    (re.compile(r'\b(?:gods? *)?damn\b',re.I),'dang',keep_case),
+    #Final catch-all
+    (re.compile(r'(?:gods? *)?damn',re.I),'dang',keep_case),
+    #########################################
+    # Bitch
+    #########################################
+    #Son of a bitch
+    (re.compile(r's[UuOo]n(s)?([ -])?[OoUu][FfVv]([ -])?(a)?([ -])?bitch(e)?',re.I),'jerk',keep_case),
+    #verb
+    (re.compile(r'bitchin[^\s]',re.U+re.I),'complaining',keep_case),
+    (re.compile(r'bitched',re.I),'complained',keep_case),
+    (re.compile(r'bitche?(?=s? abo)',re.I),'complain',keep_case),
+    (re.compile(r'(?<=(n([^\s]|o)t ))bitch',re.U+re.I),'complain',keep_case),
+    #A bitch
+    (re.compile(r's a bitch',re.I),'s tough',keep_case),
+    #Bitch by itself
+    (re.compile(r'\bbitch(e)?',re.I),'jerk',keep_case),
+    #########################################
+    # Shit
+    #########################################
+    #bullshit
+    (re.compile(r'\b(bull|horse|dog|jack)(.)?shit',re.I),'shit',keep_case),
+    #Holy shit
+    (re.compile(r'\bholy\W*shit',re.I),'incredible',keep_case),
+    #exclamation
+    (re.compile(r'(?<=oh, )shit\b',re.I),'shoot',keep_case),
+    (re.compile(r'(?<=oh )shit\b',re.I),'shoot',keep_case),
+    (re.compile(r'(?