Skip to content

Commit

Permalink
add all files from langetect. including pycache
Browse files Browse the repository at this point in the history
  • Loading branch information
Stefan Lohmaier committed Jan 15, 2024
1 parent f8b514a commit 413819d
Show file tree
Hide file tree
Showing 19 changed files with 953 additions and 0 deletions.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Empty file.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
70 changes: 70 additions & 0 deletions addon/globalPlugins/langdetect/utils/lang_profile.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
from collections import defaultdict
import re

import six
from six.moves import xrange

from .ngram import NGram


class LangProfile(object):
    '''N-gram frequency table for a single language.

    Attributes:
        name: Profile identifier. While it is None, add() and
            omit_less_freq() return without doing anything.
        freq: defaultdict(int) mapping each n-gram to its count.
        n_words: List in which n_words[k - 1] holds the total count of
            n-grams of length k. A list passed to the constructor is
            stored as-is (not copied) and is mutated in place.
    '''

    MINIMUM_FREQ = 2
    LESS_FREQ_RATIO = 100000

    ROMAN_CHAR_RE = re.compile(r'^[A-Za-z]$')
    ROMAN_SUBSTR_RE = re.compile(r'.*[A-Za-z].*')

    def __init__(self, name=None, freq=None, n_words=None):
        self.name = name

        # Always own a fresh defaultdict; entries of ``freq`` are copied in.
        self.freq = defaultdict(int)
        if freq is not None:
            self.freq.update(freq)

        self.n_words = [0] * NGram.N_GRAM if n_words is None else n_words

    def add(self, gram):
        '''Count one occurrence of ``gram``.

        Nothing happens when the profile has no name, when ``gram`` is
        None, or when its length is outside 1..NGram.N_GRAM.
        '''
        if self.name is None or gram is None:  # Illegal
            return
        size = len(gram)
        if size < 1 or size > NGram.N_GRAM:  # Illegal
            return
        self.n_words[size - 1] += 1
        self.freq[gram] += 1

    def omit_less_freq(self):
        '''Drop rare n-grams and, if Latin letters are a minority, all
        n-grams that contain a Latin letter.

        n_words is kept in sync with every removal.
        '''
        if self.name is None:  # Illegal
            return

        # The cut-off is fixed before anything is removed.
        floor = max(self.n_words[0] // self.LESS_FREQ_RATIO,
                    self.MINIMUM_FREQ)

        latin_total = 0
        for ngram in list(self.freq):
            occurrences = self.freq[ngram]
            if occurrences > floor:
                # Kept entry: tally single Latin letters for the check below.
                if self.ROMAN_CHAR_RE.match(ngram):
                    latin_total += occurrences
                continue
            self.n_words[len(ngram) - 1] -= occurrences
            del self.freq[ngram]

        # roman check: uses the 1-gram total *after* the pruning above.
        if latin_total >= self.n_words[0] // 3:
            return
        latin_keys = [k for k in self.freq if self.ROMAN_SUBSTR_RE.match(k)]
        for ngram in latin_keys:
            self.n_words[len(ngram) - 1] -= self.freq.pop(ngram)

    def update(self, text):
        '''Feed (possibly fragmented) text into the profile.

        The text is passed through NGram.normalize_vi, then pushed one
        character at a time into an NGram buffer; after each character
        every n-gram of length 1..NGram.N_GRAM is handed to add().
        A None ``text`` is ignored.
        '''
        if text is None:
            return
        normalized = NGram.normalize_vi(text)
        window = NGram()
        sizes = range(1, NGram.N_GRAM + 1)
        for char in normalized:
            window.add_char(char)
            for size in sizes:
                self.add(window.get(size))
135 changes: 135 additions & 0 deletions addon/globalPlugins/langdetect/utils/messages.properties

Large diffs are not rendered by default.

23 changes: 23 additions & 0 deletions addon/globalPlugins/langdetect/utils/messages.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
from os import path


class Messages(object):
    '''Key/value string table loaded from ``messages.properties``.

    The file is looked up next to this module. Each line is stripped and
    split at its first '='; the part after it is run through the
    ``unicode_escape`` codec so backslash escapes become real characters.
    '''

    MESSAGES_FILENAME = path.join(path.dirname(__file__), 'messages.properties')

    def __init__(self):
        self.messages = {}
        with open(self.MESSAGES_FILENAME, 'r') as handle:
            fields = (raw.strip().partition('=') for raw in handle)
            self.messages.update(
                (name, text.encode().decode('unicode_escape'))
                for name, _sep, text in fields)

    def get_string(self, key):
        '''Return the string stored under ``key``, or ``'!<key>!'`` when
        the key is unknown.'''
        fallback = '!%s!' % key
        return self.messages.get(key, fallback)


# Shared Messages instance; created on the first get_string() call.
_messages = None


def get_string(key):
    '''Look up ``key`` in the shared Messages table.

    The table is built on first use and cached in the module-level
    ``_messages`` for all later calls.
    '''
    global _messages
    catalog = _messages
    if catalog is None:
        catalog = Messages()
        _messages = catalog
    return catalog.get_string(key)
260 changes: 260 additions & 0 deletions addon/globalPlugins/langdetect/utils/ngram.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,260 @@
import re

import six

from . import messages
from .unicode_block import (
unicode_block,
UNICODE_BASIC_LATIN,
UNICODE_LATIN_1_SUPPLEMENT,
UNICODE_LATIN_EXTENDED_B,
UNICODE_GENERAL_PUNCTUATION,
UNICODE_ARABIC,
UNICODE_LATIN_EXTENDED_ADDITIONAL,
UNICODE_HIRAGANA,
UNICODE_KATAKANA,
UNICODE_BOPOMOFO,
UNICODE_BOPOMOFO_EXTENDED,
UNICODE_CJK_UNIFIED_IDEOGRAPHS,
UNICODE_HANGUL_SYLLABLES,
)


class NGram(object):
    '''Sliding character buffer that yields 1..N_GRAM-grams, plus the
    character normalization rules applied before buffering.

    Attributes:
        grams: The current buffer; starts as a single space.
        capitalword: Set once two upper-case characters are seen in a
            row; while set, get() yields nothing.
    '''

    LATIN1_EXCLUDED = messages.get_string('NGram.LATIN1_EXCLUDE')
    N_GRAM = 3

    # Tables for Vietnamese normalization. NORMALIZED_VI_CHARS is indexed
    # first by the position of the diacritical mark in DMARK_CLASS and then
    # by the position of the base letter in TO_NORMALIZE_VI_CHARS.
    NORMALIZED_VI_CHARS = [
        messages.get_string('NORMALIZED_VI_CHARS_0300'),
        messages.get_string('NORMALIZED_VI_CHARS_0301'),
        messages.get_string('NORMALIZED_VI_CHARS_0303'),
        messages.get_string('NORMALIZED_VI_CHARS_0309'),
        messages.get_string('NORMALIZED_VI_CHARS_0323')]
    TO_NORMALIZE_VI_CHARS = messages.get_string('TO_NORMALIZE_VI_CHARS')
    DMARK_CLASS = messages.get_string('DMARK_CLASS')
    ALPHABET_WITH_DMARK = re.compile(
        '([' + TO_NORMALIZE_VI_CHARS + '])([' + DMARK_CLASS + '])',
        re.UNICODE)

    # CJK Kanji Normalization Mapping.
    # Each entry is one string of characters; _init_cjk_map() maps every
    # character of an entry to that entry's first character.
    CJK_CLASS = [
        messages.get_string('NGram.KANJI_1_0'),
        messages.get_string('NGram.KANJI_1_2'),
        messages.get_string('NGram.KANJI_1_4'),
        messages.get_string('NGram.KANJI_1_8'),
        messages.get_string('NGram.KANJI_1_11'),
        messages.get_string('NGram.KANJI_1_12'),
        messages.get_string('NGram.KANJI_1_13'),
        messages.get_string('NGram.KANJI_1_14'),
        messages.get_string('NGram.KANJI_1_16'),
        messages.get_string('NGram.KANJI_1_18'),
        messages.get_string('NGram.KANJI_1_22'),
        messages.get_string('NGram.KANJI_1_27'),
        messages.get_string('NGram.KANJI_1_29'),
        messages.get_string('NGram.KANJI_1_31'),
        messages.get_string('NGram.KANJI_1_35'),
        messages.get_string('NGram.KANJI_2_0'),
        messages.get_string('NGram.KANJI_2_1'),
        messages.get_string('NGram.KANJI_2_4'),
        messages.get_string('NGram.KANJI_2_9'),
        messages.get_string('NGram.KANJI_2_10'),
        messages.get_string('NGram.KANJI_2_11'),
        messages.get_string('NGram.KANJI_2_12'),
        messages.get_string('NGram.KANJI_2_13'),
        messages.get_string('NGram.KANJI_2_15'),
        messages.get_string('NGram.KANJI_2_16'),
        messages.get_string('NGram.KANJI_2_18'),
        messages.get_string('NGram.KANJI_2_21'),
        messages.get_string('NGram.KANJI_2_22'),
        messages.get_string('NGram.KANJI_2_23'),
        messages.get_string('NGram.KANJI_2_28'),
        messages.get_string('NGram.KANJI_2_29'),
        messages.get_string('NGram.KANJI_2_30'),
        messages.get_string('NGram.KANJI_2_31'),
        messages.get_string('NGram.KANJI_2_32'),
        messages.get_string('NGram.KANJI_2_35'),
        messages.get_string('NGram.KANJI_2_36'),
        messages.get_string('NGram.KANJI_2_37'),
        messages.get_string('NGram.KANJI_2_38'),
        messages.get_string('NGram.KANJI_3_1'),
        messages.get_string('NGram.KANJI_3_2'),
        messages.get_string('NGram.KANJI_3_3'),
        messages.get_string('NGram.KANJI_3_4'),
        messages.get_string('NGram.KANJI_3_5'),
        messages.get_string('NGram.KANJI_3_8'),
        messages.get_string('NGram.KANJI_3_9'),
        messages.get_string('NGram.KANJI_3_11'),
        messages.get_string('NGram.KANJI_3_12'),
        messages.get_string('NGram.KANJI_3_13'),
        messages.get_string('NGram.KANJI_3_15'),
        messages.get_string('NGram.KANJI_3_16'),
        messages.get_string('NGram.KANJI_3_18'),
        messages.get_string('NGram.KANJI_3_19'),
        messages.get_string('NGram.KANJI_3_22'),
        messages.get_string('NGram.KANJI_3_23'),
        messages.get_string('NGram.KANJI_3_27'),
        messages.get_string('NGram.KANJI_3_29'),
        messages.get_string('NGram.KANJI_3_30'),
        messages.get_string('NGram.KANJI_3_31'),
        messages.get_string('NGram.KANJI_3_32'),
        messages.get_string('NGram.KANJI_3_35'),
        messages.get_string('NGram.KANJI_3_36'),
        messages.get_string('NGram.KANJI_3_37'),
        messages.get_string('NGram.KANJI_3_38'),
        messages.get_string('NGram.KANJI_4_0'),
        messages.get_string('NGram.KANJI_4_9'),
        messages.get_string('NGram.KANJI_4_10'),
        messages.get_string('NGram.KANJI_4_16'),
        messages.get_string('NGram.KANJI_4_17'),
        messages.get_string('NGram.KANJI_4_18'),
        messages.get_string('NGram.KANJI_4_22'),
        messages.get_string('NGram.KANJI_4_24'),
        messages.get_string('NGram.KANJI_4_28'),
        messages.get_string('NGram.KANJI_4_34'),
        messages.get_string('NGram.KANJI_4_39'),
        messages.get_string('NGram.KANJI_5_10'),
        messages.get_string('NGram.KANJI_5_11'),
        messages.get_string('NGram.KANJI_5_12'),
        messages.get_string('NGram.KANJI_5_13'),
        messages.get_string('NGram.KANJI_5_14'),
        messages.get_string('NGram.KANJI_5_18'),
        messages.get_string('NGram.KANJI_5_26'),
        messages.get_string('NGram.KANJI_5_29'),
        messages.get_string('NGram.KANJI_5_34'),
        messages.get_string('NGram.KANJI_5_39'),
        messages.get_string('NGram.KANJI_6_0'),
        messages.get_string('NGram.KANJI_6_3'),
        messages.get_string('NGram.KANJI_6_9'),
        messages.get_string('NGram.KANJI_6_10'),
        messages.get_string('NGram.KANJI_6_11'),
        messages.get_string('NGram.KANJI_6_12'),
        messages.get_string('NGram.KANJI_6_16'),
        messages.get_string('NGram.KANJI_6_18'),
        messages.get_string('NGram.KANJI_6_20'),
        messages.get_string('NGram.KANJI_6_21'),
        messages.get_string('NGram.KANJI_6_22'),
        messages.get_string('NGram.KANJI_6_23'),
        messages.get_string('NGram.KANJI_6_25'),
        messages.get_string('NGram.KANJI_6_28'),
        messages.get_string('NGram.KANJI_6_29'),
        messages.get_string('NGram.KANJI_6_30'),
        messages.get_string('NGram.KANJI_6_32'),
        messages.get_string('NGram.KANJI_6_34'),
        messages.get_string('NGram.KANJI_6_35'),
        messages.get_string('NGram.KANJI_6_37'),
        messages.get_string('NGram.KANJI_6_39'),
        messages.get_string('NGram.KANJI_7_0'),
        messages.get_string('NGram.KANJI_7_3'),
        messages.get_string('NGram.KANJI_7_6'),
        messages.get_string('NGram.KANJI_7_7'),
        messages.get_string('NGram.KANJI_7_9'),
        messages.get_string('NGram.KANJI_7_11'),
        messages.get_string('NGram.KANJI_7_12'),
        messages.get_string('NGram.KANJI_7_13'),
        messages.get_string('NGram.KANJI_7_16'),
        messages.get_string('NGram.KANJI_7_18'),
        messages.get_string('NGram.KANJI_7_19'),
        messages.get_string('NGram.KANJI_7_20'),
        messages.get_string('NGram.KANJI_7_21'),
        messages.get_string('NGram.KANJI_7_23'),
        messages.get_string('NGram.KANJI_7_25'),
        messages.get_string('NGram.KANJI_7_28'),
        messages.get_string('NGram.KANJI_7_29'),
        messages.get_string('NGram.KANJI_7_32'),
        messages.get_string('NGram.KANJI_7_33'),
        messages.get_string('NGram.KANJI_7_35'),
        messages.get_string('NGram.KANJI_7_37')]

    # Filled in by _init_cjk_map(), which is called right after the class.
    CJK_MAP = {}

    def __init__(self):
        self.capitalword = False
        self.grams = ' '

    def add_char(self, ch):
        '''Normalize ``ch`` and push it into the n-gram buffer.'''
        ch = self.normalize(ch)
        previous = self.grams[-1]
        if previous == ' ':
            # The buffer ended on a space: start over from a lone space.
            self.grams = ' '
            self.capitalword = False
            if ch == ' ':
                return
        elif len(self.grams) >= self.N_GRAM:
            # Buffer is full: discard the oldest character.
            self.grams = self.grams[1:]
        self.grams += ch

        if not ch.isupper():
            self.capitalword = False
        elif previous.isupper():
            # Two upper-case characters in a row.
            self.capitalword = True

    def get(self, n):
        '''Return the last ``n`` buffered characters, or None.

        None is returned while capitalword is set, when ``n`` is outside
        1..N_GRAM, when fewer than ``n`` characters are buffered, and
        when the requested 1-gram would be a space.
        '''
        if self.capitalword:
            return None
        out_of_range = n < 1 or n > self.N_GRAM
        if out_of_range or len(self.grams) < n:
            return None
        tail = self.grams[-n:]
        if n == 1 and tail == ' ':
            return None
        return tail

    @classmethod
    def normalize(cls, ch):
        '''Map ``ch`` to its normalized form based on its Unicode block.

        Characters in blocks not handled below are returned unchanged.
        '''
        block = unicode_block(ch)
        if block == UNICODE_BASIC_LATIN:
            # Only A-Z and a-z survive; everything else becomes a space.
            is_letter = 'A' <= ch <= 'Z' or 'a' <= ch <= 'z'
            return ch if is_letter else ' '
        if block == UNICODE_LATIN_1_SUPPLEMENT:
            return ' ' if cls.LATIN1_EXCLUDED.find(ch) >= 0 else ch
        if block == UNICODE_LATIN_EXTENDED_B:
            # normalization for Romanian
            if ch == six.u('\u0219'):  # Small S with comma below => with cedilla
                return six.u('\u015f')
            if ch == six.u('\u021b'):  # Small T with comma below => with cedilla
                return six.u('\u0163')
            return ch
        if block == UNICODE_GENERAL_PUNCTUATION:
            return ' '
        if block == UNICODE_ARABIC:
            if ch == six.u('\u06cc'):
                return six.u('\u064a')  # Farsi yeh => Arabic yeh
            return ch
        if block == UNICODE_LATIN_EXTENDED_ADDITIONAL:
            # Everything from U+1EA0 upwards is folded into U+1EC3.
            if ch >= six.u('\u1ea0'):
                return six.u('\u1ec3')
            return ch
        # The following blocks each collapse to one fixed character.
        if block == UNICODE_HIRAGANA:
            return six.u('\u3042')
        if block == UNICODE_KATAKANA:
            return six.u('\u30a2')
        if block in (UNICODE_BOPOMOFO, UNICODE_BOPOMOFO_EXTENDED):
            return six.u('\u3105')
        if block == UNICODE_CJK_UNIFIED_IDEOGRAPHS:
            return cls.CJK_MAP.get(ch, ch)
        if block == UNICODE_HANGUL_SYLLABLES:
            return six.u('\uac00')
        return ch

    @classmethod
    def normalize_vi(cls, text):
        '''Normalizer for Vietnamese.
        Replace each Alphabet + Diacritical Mark(U+03xx) pair with the
        single U+1Exx character taken from NORMALIZED_VI_CHARS.
        '''
        def compose(match):
            base, mark = match.groups()
            column = cls.TO_NORMALIZE_VI_CHARS.find(base)
            row = cls.DMARK_CLASS.find(mark)  # Diacritical Mark
            return cls.NORMALIZED_VI_CHARS[row][column]
        return cls.ALPHABET_WITH_DMARK.sub(compose, text)

    @classmethod
    def _init_cjk_map(cls):
        '''Populate CJK_MAP: every character of each CJK_CLASS entry maps
        to the first character of that entry.'''
        for group in cls.CJK_CLASS:
            cls.CJK_MAP.update(dict.fromkeys(group, group[0]))


NGram._init_cjk_map()
Loading

0 comments on commit 413819d

Please sign in to comment.