Initial commit

jdanders · May 25, 2017 · da8c7a6 · da8c7a6
1 parent 5975f8d
commit da8c7a6
Show file tree

Hide file tree

Showing 5 changed files with 626 additions and 0 deletions.
diff --git a/Language_Cleaner.zip b/Language_Cleaner.zip
diff --git a/README.md b/README.md
@@ -1,2 +1,26 @@
 # calibre-plugin-language-cleaner
 Lengthy list of regexes to "clean up" language in books.
+
+I wrote this plugin because I don't like reading vulgar language, but I like reading books with vulgar language in them :). Personally I find books much more enjoyable after being processed with this script. Obviously it is a personal set of filters, but I've done my best to make the changes sound as natural as possible, and after using it for years, I think it's pretty good.
+
+If you'd like to customize it to meet your preferences, you just need to go through the lines of cleaner.py and add or remove filters as needed. You'll probably need a pretty good mastery of regular expressions to write new ones unless there is a similar one existing already that you can tweak.
+
+## LIMITATIONS
+
+I am no expert at calibre, and I could not drum up much help on the support forums, so the integration is pretty weak. It only works on books that are being converted from epub, and only works during the conversion process.
+
+To install:
+* Create a zip file called `Language_Cleaner.zip` containing the three files `cleaner.py`, `__init__.py`, and `plugin-import-name-language_clean_plugin.txt`. This command may help on Linux:
+`zip Language_Cleaner cleaner.py __init__.py plugin-import-name-language_clean_plugin.txt`
+* In calibre choose Preferences -> Plugins -> Load plugin from file
+* Choose the zip you just created, and the plugin should show up under "File type plugins"
+
+To use:
+* Choose the book you'd like and make sure you have an epub format (so convert to epub if you don't already have that format)
+* Now do "Convert book" and choose to convert from Epub to Epub (or whatever destination format you want)
+* Wait for the job to complete; it takes longer than usual because of the very inefficient way this plugin works
+
+Secret debug tip:
+If there is a "c:/Scratch/calibre" folder on your Windows machine (change `logdir` in `__init__.py` if you want), the plugin will write before and after versions of the book as plain text files. Sometimes it does two copies and only one has useful changes. If you'd like to see how it was changed, compare the two files. I use [WinMerge](http://winmerge.org/) and that works well.
+
+By the way, there is a strong layer of irony here -- if vulgar language offends you, you'll probably want to avoid actually looking in the `cleaner.py` file, as it is chock full of it :)
diff --git a/__init__.py b/__init__.py
@@ -0,0 +1,123 @@
+#!/usr/bin/env python
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import (unicode_literals, division, absolute_import,
+                        print_function, with_statement)
+# Directory for optional debug dumps. CleanerPlugin.run writes "before" and
+# "after" text snapshots of the book here, but only when the cleaner changed
+# something and this directory already exists; it is never created.
+logdir = "c:/Scratch/calibre"
+# Standard module metadata.
+__license__   = 'GPL v3'
+__copyright__ = '2012, Jordan Anderson'
+__docformat__ = 'restructuredtext en'
+
+#from __future__ import with_statement
+import sys, os, time
+from calibre.customize import FileTypePlugin
+from optparse import OptionGroup, Option
+from calibre.ebooks.tweak import *
+from calibre_plugins.language_clean_plugin.cleaner import *
+from functools import partial
+import codecs, mimetypes
+
+class CleanerPlugin(FileTypePlugin):
+
+    name                = 'Language Cleaner' # Name of the plugin
+    description         = 'Replace naughty or offensive language with something more acceptable (to me at least), recovered version'
+    supported_platforms = ['windows', 'osx', 'linux'] # Platforms this plugin will run on
+    author              = 'Jordan Anderson' # The author of this plugin
+    version             = (1, 5, 2017)   # The version number of this plugin
+    file_types          = set(['epub']) # The file types that this plugin will be applied to
+    on_preprocess       = True # Run this plugin after conversion is complete
+    minimum_calibre_version = (0, 7, 53)
+
+    def run(self, path_to_ebook):
+        #print ("*"*60,"\n","you are in Language Cleaner")
+        #print ("*"*60,"\n")
+        ebook_file=path_to_ebook
+        fmt = ebook_file.rpartition('.')[-1].lower()
+        exploder, rebuilder = get_tools(fmt)
+        with TemporaryDirectory('_tweak_'+
+            os.path.basename(ebook_file).rpartition('.')[0]) as tdir:
+            #prints ("Relevant info:",tdir,fmt,ebook_file)
+            try:
+                opf = exploder(ebook_file, tdir)
+            except WorkerError as e:
+                prints('Failed to unpack', ebook_file)
+                prints(e.orig_tb)
+                raise SystemExit(1)
+            except Error as e:
+                prints(as_unicode(e), file=sys.stderr)
+                raise SystemExit(1)
+            #Debug
+            print ("Created tdir:",tdir,"and found opf",opf)
+            #print (os.popen("ll "+tdir).read())
+            #print ("OPF CONTENTS:")
+            #print (open(opf,'r').read())
+            #manipulate all of the files
+            opf = open(opf,'r').read().split('\n')
+            # first, assemble the entire text to evaluate context
+            text=""
+            for f in walk(tdir):
+              opf_line = [ii for ii in opf if os.path.basename(f).lower() in ii.lower()]
+              ftype = mimetypes.guess_type(f)[0]
+              if not ftype and "html" in f.split('.')[-1]:
+                  print('Non-text type %s for file %s but forcing text mode'%(ftype, f))
+                  ftype = 'text'
+              if not ftype:
+                  print('Non-text type %s for file %s'%(ftype, f))
+              elif opf_line and 'text' in ftype:
+                encodings = ['utf-8', 'windows-1252', 'windows-1250']
+                for e in encodings:
+                    try:
+                        text += codecs.open(f,'r',encoding=e).read()
+                    except UnicodeDecodeError:
+                        print('File %s: got unicode error with %s , trying different encoding' % (f,e))
+                    else:
+                        print('File %s: opening the file with encoding:  %s ' % (f,e))
+                        break  
+            replacement_list = language_check(text)
+            start_text=text
+            end_text=""
+            #Now do replacements on each file
+            for f in walk(tdir):
+              opf_line = [ii for ii in opf if os.path.basename(f).lower() in ii.lower()]
+              #Not sure what the correct way to determine which files should
+              # be edited. Seems like most are marked 'application/' in type
+              print ("File",f,"\nOPF line:\n",opf_line)
+              ftype = mimetypes.guess_type(f)[0]
+              if not ftype and "html" in f.split('.')[-1]:
+                  print('Non-text type %s for file %s but forcing text mode'%(ftype, f))
+                  ftype = 'text'
+              if not ftype:
+                  print('Non-text type %s for file %s'%(ftype, f))
+              elif opf_line and 'text' in ftype:
+                print ("Cleaning",f)
+                text = open(f,'r').read()
+                output = ""
+                for line in text.split("\n"):
+                  #Go through all elements of replacement_list
+                  for search,sub,pcase in replacement_list:
+                    if pcase: # Preserve case
+                      line = search.sub(partial(pcase,sub),line)
+                    else: # Don't preserve case
+                      line = search.sub(sub,line)
+                  output += line + "\n"
+                open(f,'w').write(output)
+                end_text += output
+            if start_text.replace('\n',"") == end_text.replace('\n',''):
+                print ("Language cleaner made no changes")
+            else:
+                if os.path.exists(logdir):
+                    open(logdir+os.sep+'%s_init.txt'%(os.path.basename(ebook_file)+str(time.time())),'w').write(start_text)
+                    open(logdir+os.sep+'%s_mod.txt'%(os.path.basename(ebook_file)+str(time.time())),'w').write(end_text)
+            prints('Rebuilding', ebook_file, 'please wait ...')
+            try:
+                rebuilder(tdir, ebook_file)
+            except WorkerError as e:
+                prints('Failed to rebuild', ebook_file)
+                prints(e.orig_tb)
+                raise SystemExit(1)
+            prints(ebook_file, 'successfully cleaned')
+
+        #print (path_to_ebook,ext,str(mi))
+        #print ("you are returning from Language Cleaner")
+        return ebook_file
+
+