-
Notifications
You must be signed in to change notification settings - Fork 0
/
chardet
executable file
·90 lines (81 loc) · 2.83 KB
/
chardet
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import sys, argparse, logging, magic
from extras import ColoredArgParser
from extras import init_colored_logger
from extras import PrettyVTable
from extras import ColoredSetting
# Name of the detector main() falls back to when no -m/--module option was given.
DEFAULT_MODULE = 'UnicodeDammit' # bs4.UnicodeDammit
def parse_args():
    """Declare the command-line interface and parse the process arguments.

    Three arguments are registered on a ``ColoredArgParser``:

    * ``-m`` / ``--module`` -- repeatable, restricted to ``UnicodeDammit``,
      ``chardet`` or ``cchardet``.
    * ``--color`` -- one of ``auto``, ``always``, ``never``; stored under the
      ``colored`` attribute and defaulting to ``auto``.
    * ``files`` -- zero or more paths opened in binary mode, with
      ``(sys.stdin,)`` as the declared default.

    Returns:
        Whatever ``ColoredArgParser.parse_args()`` returns (presumably an
        ``argparse.Namespace`` -- confirm against ``extras``).
    """
    supported_modules = ['UnicodeDammit', 'chardet', 'cchardet']
    color_modes = ['auto', 'always', 'never']

    cli = ColoredArgParser(description='Universal character encoding detector.')
    cli.add_argument(
        '-m',
        '--module',
        action='append',
        choices=supported_modules,
        help='Specify which modules or libraries to used. Supported bs4.UnicodeDammit, chardet, cchardet. the default is bs4.UnicodeDammit.',
    )
    cli.add_argument(
        '--color',
        dest='colored',
        choices=color_modes,
        default='auto',
        help='When to colored, the default is auto.',
    )
    cli.add_argument(
        'files',
        nargs='*',
        type=argparse.FileType(mode='rb'),
        default=(sys.stdin,),
        help='Input file.',
    )

    namespace = cli.parse_args()
    return namespace
def on_unicodedammit(bs):
    """Run ``bs4.UnicodeDammit`` over *bs* and report what it found.

    Args:
        bs: Raw bytes to analyse.

    Returns:
        A ``(text, info)`` tuple: the detector's ``unicode_markup`` and the
        ``str()`` of a dict holding its ``original_encoding`` and
        ``tried_encodings``.

    Raises:
        Nothing is caught here; any error from importing ``bs4`` or from
        ``UnicodeDammit`` reaches the caller unchanged.
    """
    # Imported inside the function, so bs4 is only loaded when this is called.
    import bs4

    detector = bs4.UnicodeDammit(bs)
    text = detector.unicode_markup
    report = {
        'original encoding': detector.original_encoding,
        'tried encodings' : detector.tried_encodings,
    }
    return text, str(report)
def on_chardet(bs):
    """Detect the encoding of *bs* with ``chardet`` and decode it.

    Args:
        bs: Raw bytes to analyse.

    Returns:
        A ``(text, info)`` tuple. ``info`` is the ``str()`` of the dict
        returned by ``chardet.detect``. ``text`` is *bs* decoded with the
        reported encoding, or ``None`` when no encoding was reported.

    Raises:
        ImportError: If ``chardet`` cannot be imported.
        UnicodeDecodeError, LookupError: If *bs* cannot be decoded with the
            reported encoding.
    """
    # Imported inside the function, so chardet is only loaded when this is called.
    import chardet

    detection = chardet.detect(bs)
    codec = detection['encoding']
    if codec is None:
        # The detector may report no encoding at all (e.g. for empty input).
        # bytes.decode(encoding=None) would raise TypeError and throw the
        # detection report away, so skip decoding and still return the report.
        return None, str(detection)
    return bs.decode(encoding=codec), str(detection)
def on_cchardet(bs):
    """Detect the encoding of *bs* with ``cchardet`` and decode it.

    Args:
        bs: Raw bytes to analyse.

    Returns:
        A ``(text, info)`` tuple. ``info`` is the ``str()`` of the dict
        returned by ``cchardet.detect``. ``text`` is *bs* decoded with the
        reported encoding, or ``None`` when no encoding was reported.

    Raises:
        ImportError: If ``cchardet`` cannot be imported.
        UnicodeDecodeError, LookupError: If *bs* cannot be decoded with the
            reported encoding.
    """
    # Imported inside the function, so cchardet is only loaded when this is called.
    import cchardet

    detection = cchardet.detect(bs)
    codec = detection['encoding']
    if codec is None:
        # The detector may report no encoding at all (e.g. for empty input).
        # bytes.decode(encoding=None) would raise TypeError and throw the
        # detection report away, so skip decoding and still return the report.
        return None, str(detection)
    return bs.decode(encoding=codec), str(detection)
# Dispatch table: detector name (as accepted by -m/--module) -> function that
# takes the raw bytes and returns a (text, info) tuple.
module_switcher = dict(
    UnicodeDammit=on_unicodedammit,
    chardet=on_chardet,
    cchardet=on_cchardet,
)
def main(args = None):
    """Run each selected detector on each input file and print a result table.

    Args:
        args: Parsed command-line namespace providing ``colored``, ``module``
            and ``files`` (the attributes declared by ``parse_args``). When
            ``None``, nothing is done.

    Returns:
        ``0`` when *args* is ``None``; otherwise ``None`` once the table has
        been printed.
    """
    if args is None:
        return 0

    ColoredSetting(args.colored)
    init_colored_logger()
    logger = logging.getLogger(__name__)

    pt = PrettyVTable(enable_painting=ColoredSetting().is_colorize(sys.stdout))
    pt.add_column('>')
    pt.add_column('<')
    for field in ('Module:', 'File:', 'Encoding:', 'File type:', 'MIME type:'):
        pt.add_field(field)

    # Fall back to the default detector when -m/--module was not given.
    modules = args.module or (DEFAULT_MODULE,)

    for file in args.files:
        if file == sys.stdin:
            # The text-mode stdin wrapper yields str; detectors need raw bytes.
            file = sys.stdin.buffer
            print('Press Ctrl-D when finished.', flush=True)
        bs = file.read()

        for m in modules:
            try:
                _, encoding = module_switcher.get(m)(bs)
                ft = magic.from_buffer(bs)
                mime = magic.from_buffer(bs, mime=True)
            except Exception as e:
                # Log the file's *name*: concatenating the file object itself
                # with a str raises TypeError inside this handler and aborts
                # the run instead of reporting the failure and moving on.
                logger.error('%s: %s', file.name, e)
                continue
            pt.add_record(m, file.name, encoding, ft, mime)

    print(pt)
if __name__ == '__main__':
    # Script entry point: the process exit status is whatever main() returns.
    raise SystemExit(main(parse_args()))