Skip to content

Commit

Permalink
Initial commit
Browse files Browse the repository at this point in the history
  • Loading branch information
Jordan Anderson committed May 25, 2017
1 parent 5975f8d commit da8c7a6
Show file tree
Hide file tree
Showing 5 changed files with 626 additions and 0 deletions.
Binary file added Language_Cleaner.zip
Binary file not shown.
24 changes: 24 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,2 +1,26 @@
# calibre-plugin-language-cleaner
Lengthy list of regexes to "clean up" language in books.

I wrote this plugin because I don't like reading vulgar language, but I like reading books with vulgar language in them :). Personally I find books much more enjoyable after being processed with this script. Obviously it is a personal set of filters, but I've done my best to make the changes sound as natural as possible, and after using it for years, I think it's pretty good.

If you'd like to customize it to meet your preferences, you just need to go through the lines of cleaner.py and add or remove filters as needed. You'll probably need a pretty good mastery of regular expressions to write new ones unless there is a similar one existing already that you can tweak.

== LIMITATIONS ==

I am no expert at calibre, and I could not drum up much help on the support forums, so the integration is pretty weak. It only works on books that are being converted from epub, and only works during the conversion process.

To install:
* create a zip file with the three files: `cleaner.py`, `__init__.py`, and `plugin-import-name-language_clean_plugin.txt` called `Language_Cleaner.zip`. This command may help in Linux.
`zip Language_Cleaner cleaner.py __init__.py plugin-import-name-language_clean_plugin.txt`
* In calibre choose Preferences -> Plugins -> Load plugin from file
* Choose the zip you just created, and the plugin should show up under "File type plugins"

To use:
* Choose the book you'd like and make sure you have an epub format (so convert to epub if you don't already have that format)
* Now do "Convert book" and choose to convert from Epub to Epub (or whatever destination format you want)
* Wait until the longer-than-usual job completes, due to the very inefficient way this plugin works

Secret debug tip:
If there is a "c:/Scratch/calibre" folder on your Windows machine (change `logdir` in `__init__.py` if you want), the plugin will write before and after versions of the book as plain text files. Sometimes it does two copies and only one has useful changes. If you'd like to see how it was changed, compare the two files. I use [WinMerge](http://winmerge.org/) and that works well.

By the way, there is a strong layer of irony here -- if vulgar language offends you, you'll probably want to avoid actually looking in the `cleaner.py` file, as it is chock full of it :)
123 changes: 123 additions & 0 deletions __init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import (unicode_literals, division, absolute_import,
print_function, with_statement)
# Directory that receives before/after plain-text dumps of the book for
# debugging. The dumps are only written when this directory already exists
# and the cleaner actually changed something.
logdir = "c:/Scratch/calibre"
# Standard module metadata.
__license__ = 'GPL v3'
__copyright__ = '2012, Jordan Anderson'
__docformat__ = 'restructuredtext en'

#from __future__ import with_statement
import sys, os, time
from calibre.customize import FileTypePlugin
from optparse import OptionGroup, Option
from calibre.ebooks.tweak import *
from calibre_plugins.language_clean_plugin.cleaner import *
from functools import partial
import codecs, mimetypes

class CleanerPlugin(FileTypePlugin):
    """File-type plugin that applies the language-cleaning substitutions to an EPUB.

    The book is unpacked into a temporary directory, every text file that the
    OPF mentions is run through the substitutions returned by
    ``language_check``, and the book is repacked in place when anything
    changed.
    """

    name = 'Language Cleaner'  # Name of the plugin
    description = 'Replace naughty or offensive language with something more acceptable (to me at least), recovered version'
    supported_platforms = ['windows', 'osx', 'linux']  # Platforms this plugin will run on
    author = 'Jordan Anderson'  # The author of this plugin
    version = (1, 5, 2017)  # The version number of this plugin
    file_types = set(['epub'])  # The file types that this plugin will be applied to
    on_preprocess = True  # Run this plugin on the input file just before conversion
    minimum_calibre_version = (0, 7, 53)

    def run(self, path_to_ebook):
        """Clean the language of the book at *path_to_ebook* in place.

        :param path_to_ebook: path of the ebook file to process; its extension
            selects the unpack/repack tools.
        :return: the same path that was passed in.
        :raises SystemExit: with status 1 when the book cannot be unpacked or
            rebuilt.
        """
        ebook_file = path_to_ebook
        fmt = ebook_file.rpartition('.')[-1].lower()
        exploder, rebuilder = get_tools(fmt)
        with TemporaryDirectory('_tweak_' +
                os.path.basename(ebook_file).rpartition('.')[0]) as tdir:
            try:
                opf = exploder(ebook_file, tdir)
            except WorkerError as e:
                prints('Failed to unpack', ebook_file)
                prints(e.orig_tb)
                raise SystemExit(1)
            except Error as e:
                prints(as_unicode(e), file=sys.stderr)
                raise SystemExit(1)
            print("Created tdir:", tdir, "and found opf", opf)
            with open(opf, 'r') as opf_file:
                opf_lines = opf_file.read().split('\n')

            # Read every candidate file exactly once so that the text handed
            # to language_check and the text that gets rewritten are the same
            # decoded content.
            documents = self._load_text_files(tdir, opf_lines)
            start_text = "".join(content for _, _, content in documents)
            # The whole book is passed in one go so the checker can evaluate
            # context across files.
            replacement_list = language_check(start_text)

            cleaned_parts = []
            for path, encoding, content in documents:
                print("Cleaning", path)
                cleaned = self._clean_text(content, replacement_list)
                if cleaned != content:
                    # Write back with the encoding the file was read with.
                    with codecs.open(path, 'w', encoding=encoding) as out:
                        out.write(cleaned)
                cleaned_parts.append(cleaned)
            end_text = "".join(cleaned_parts)

            if start_text == end_text:
                print("Language cleaner made no changes")
            else:
                if os.path.exists(logdir):
                    self._write_debug_logs(ebook_file, start_text, end_text)
                # NOTE(review): unchanged files are never rewritten above, so
                # the original archive is already correct when nothing
                # changed and the rebuild is only done here -- confirm this
                # matches the intended flow.
                prints('Rebuilding', ebook_file, 'please wait ...')
                try:
                    rebuilder(tdir, ebook_file)
                except WorkerError as e:
                    prints('Failed to rebuild', ebook_file)
                    prints(e.orig_tb)
                    raise SystemExit(1)
                prints(ebook_file, 'successfully cleaned')

        return ebook_file

    def _load_text_files(self, tdir, opf_lines):
        """Return ``[(path, encoding, content), ...]`` for the files to clean.

        A file is selected when its base name appears (case-insensitively) in
        some line of the OPF and it is a text file: either its guessed MIME
        type contains ``'text'`` or its extension contains ``'html'``.
        """
        lowered_opf = [(line, line.lower()) for line in opf_lines]
        documents = []
        for f in walk(tdir):
            base = os.path.basename(f).lower()
            opf_line = [line for line, low in lowered_opf if base in low]
            print("File", f, "\nOPF line:\n", opf_line)
            ftype = mimetypes.guess_type(f)[0]
            # html-like extensions are always treated as text, whatever the
            # platform MIME database reports (nothing at all, or a
            # non-"text" type such as application/xhtml+xml).
            if "html" in f.split('.')[-1] and not (ftype and 'text' in ftype):
                print('Non-text type %s for file %s but forcing text mode' % (ftype, f))
                ftype = 'text'
            if not ftype:
                print('Non-text type %s for file %s' % (ftype, f))
                continue
            if not opf_line or 'text' not in ftype:
                continue
            decoded = self._read_text(f)
            if decoded is None:
                # Best effort: leave a file we cannot decode untouched rather
                # than risk corrupting it.
                print('File %s: could not be decoded with any known encoding, leaving it untouched' % f)
                continue
            encoding, content = decoded
            documents.append((f, encoding, content))
        return documents

    @staticmethod
    def _read_text(path):
        """Return ``(encoding, content)`` for *path*, or None if undecodable.

        The encodings are tried in order and the first one that decodes the
        whole file without error wins.
        """
        for encoding in ('utf-8', 'windows-1252', 'windows-1250'):
            try:
                with codecs.open(path, 'r', encoding=encoding) as handle:
                    content = handle.read()
            except UnicodeDecodeError:
                print('File %s: got unicode error with %s , trying different encoding' % (path, encoding))
            else:
                print('File %s: opening the file with encoding: %s ' % (path, encoding))
                return encoding, content
        return None

    @staticmethod
    def _clean_text(content, replacement_list):
        """Apply every substitution in *replacement_list* to *content*, line by line.

        Each entry is a ``(search, sub, pcase)`` triple: ``search.sub`` is
        called on the line, with ``sub`` as the replacement, or with
        ``partial(pcase, sub)`` as the replacement callable when ``pcase`` is
        truthy (case-preserving replacement). Line endings are kept exactly
        as they were in *content*.
        """
        substitutions = [
            (search, partial(pcase, sub) if pcase else sub)
            for search, sub, pcase in replacement_list
        ]
        cleaned_lines = []
        for line in content.split("\n"):
            # Keep the CR of a CRLF ending out of the patterns' way and put it
            # back afterwards so the file's line endings are preserved.
            ending = ""
            if line.endswith("\r"):
                line, ending = line[:-1], "\r"
            for search, replacement in substitutions:
                line = search.sub(replacement, line)
            cleaned_lines.append(line + ending)
        # join() restores exactly the newlines that split() removed, so a file
        # without matches round-trips unchanged (no extra trailing newline).
        return "\n".join(cleaned_lines)

    @staticmethod
    def _write_debug_logs(ebook_file, start_text, end_text):
        """Dump the before/after text of the book into ``logdir`` as UTF-8.

        Both files share one timestamp so a pair can be matched up by name.
        """
        stamp = os.path.basename(ebook_file) + str(time.time())
        for template, body in (('%s_init.txt', start_text),
                               ('%s_mod.txt', end_text)):
            log_path = logdir + os.sep + template % stamp
            with codecs.open(log_path, 'w', encoding='utf-8') as log:
                log.write(body)


Loading

0 comments on commit da8c7a6

Please sign in to comment.