From 41e1c3f94148b779e95db29ecec84eda114972c2 Mon Sep 17 00:00:00 2001 From: quantran-novobi Date: Thu, 3 Nov 2022 10:15:24 +0700 Subject: [PATCH] Fix google_trans_new bugs --- epub-translator.py | 2 +- google_trans_new.py | 428 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 429 insertions(+), 1 deletion(-) create mode 100644 google_trans_new.py diff --git a/epub-translator.py b/epub-translator.py index d63d231..cb3c960 100644 --- a/epub-translator.py +++ b/epub-translator.py @@ -13,7 +13,7 @@ from bs4 import element from google_trans_new import google_translator -tool_version = '1.0.0' +tool_version = '1.0.1' LINE_SIZE = 90 HEADERS = { 'user-agent': ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36')} diff --git a/google_trans_new.py b/google_trans_new.py new file mode 100644 index 0000000..94b5c1f --- /dev/null +++ b/google_trans_new.py @@ -0,0 +1,428 @@ +# coding:utf-8 +# author LuShan +# version : 1.1.9 +import json, requests, random, re +from urllib.parse import quote +import urllib3 +import logging + +log = logging.getLogger(__name__) +log.addHandler(logging.NullHandler()) + +urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) + +LANGUAGES = { + 'af': 'afrikaans', + 'sq': 'albanian', + 'am': 'amharic', + 'ar': 'arabic', + 'hy': 'armenian', + 'az': 'azerbaijani', + 'eu': 'basque', + 'be': 'belarusian', + 'bn': 'bengali', + 'bs': 'bosnian', + 'bg': 'bulgarian', + 'ca': 'catalan', + 'ceb': 'cebuano', + 'ny': 'chichewa', + 'zh-cn': 'chinese (simplified)', + 'zh-tw': 'chinese (traditional)', + 'co': 'corsican', + 'hr': 'croatian', + 'cs': 'czech', + 'da': 'danish', + 'nl': 'dutch', + 'en': 'english', + 'eo': 'esperanto', + 'et': 'estonian', + 'tl': 'filipino', + 'fi': 'finnish', + 'fr': 'french', + 'fy': 'frisian', + 'gl': 'galician', + 'ka': 'georgian', + 'de': 'german', + 'el': 'greek', + 'gu': 'gujarati', + 'ht': 'haitian creole', + 'ha': 'hausa', + 'haw': 'hawaiian', + 'iw': 'hebrew', + 'he': 'hebrew', + 'hi': 'hindi', + 'hmn': 'hmong', + 'hu': 'hungarian', + 'is': 'icelandic', + 'ig': 'igbo', + 'id': 'indonesian', + 'ga': 'irish', + 'it': 'italian', + 'ja': 'japanese', + 'jw': 'javanese', + 'kn': 'kannada', + 'kk': 'kazakh', + 'km': 'khmer', + 'ko': 'korean', + 'ku': 'kurdish (kurmanji)', + 'ky': 'kyrgyz', + 'lo': 'lao', + 'la': 'latin', + 'lv': 'latvian', + 'lt': 'lithuanian', + 'lb': 'luxembourgish', + 'mk': 'macedonian', + 'mg': 'malagasy', + 'ms': 'malay', + 'ml': 'malayalam', + 'mt': 'maltese', + 'mi': 'maori', + 'mr': 'marathi', + 'mn': 'mongolian', + 'my': 'myanmar (burmese)', + 'ne': 'nepali', + 'no': 'norwegian', + 'or': 'odia', + 'ps': 'pashto', + 'fa': 'persian', + 'pl': 'polish', + 'pt': 'portuguese', + 'pa': 'punjabi', + 'ro': 'romanian', + 'ru': 'russian', + 'sm': 'samoan', + 'gd': 'scots gaelic', + 'sr': 'serbian', + 'st': 'sesotho', + 'sn': 'shona', + 'sd': 'sindhi', + 'si': 'sinhala', + 'sk': 'slovak', + 'sl': 'slovenian', + 'so': 'somali', + 'es': 'spanish', + 'su': 'sundanese', + 'sw': 'swahili', + 'sv': 'swedish', + 'tg': 'tajik', + 'ta': 'tamil', + 'te': 'telugu', + 'th': 'thai', + 'tr': 'turkish', + 'tk': 'turkmen', + 'uk': 'ukrainian', + 'ur': 'urdu', + 'ug': 'uyghur', + 'uz': 'uzbek', + 'vi': 'vietnamese', + 'cy': 'welsh', + 'xh': 'xhosa', + 'yi': 'yiddish', + 'yo': 'yoruba', + 'zu': 'zulu', +} + +DEFAULT_SERVICE_URLS = ('translate.google.ac','translate.google.ad','translate.google.ae', + 'translate.google.al','translate.google.am','translate.google.as', + 'translate.google.at','translate.google.az','translate.google.ba', + 'translate.google.be','translate.google.bf','translate.google.bg', + 'translate.google.bi','translate.google.bj','translate.google.bs', + 'translate.google.bt','translate.google.by','translate.google.ca', + 'translate.google.cat','translate.google.cc','translate.google.cd', + 'translate.google.cf','translate.google.cg','translate.google.ch', + 'translate.google.ci','translate.google.cl','translate.google.cm', + 'translate.google.cn','translate.google.co.ao','translate.google.co.bw', + 'translate.google.co.ck','translate.google.co.cr','translate.google.co.id', + 'translate.google.co.il','translate.google.co.in','translate.google.co.jp', + 'translate.google.co.ke','translate.google.co.kr','translate.google.co.ls', + 'translate.google.co.ma','translate.google.co.mz','translate.google.co.nz', + 'translate.google.co.th','translate.google.co.tz','translate.google.co.ug', + 'translate.google.co.uk','translate.google.co.uz','translate.google.co.ve', + 'translate.google.co.vi','translate.google.co.za','translate.google.co.zm', + 'translate.google.co.zw','translate.google.co','translate.google.com.af', + 'translate.google.com.ag','translate.google.com.ai','translate.google.com.ar', + 'translate.google.com.au','translate.google.com.bd','translate.google.com.bh', + 'translate.google.com.bn','translate.google.com.bo','translate.google.com.br', + 'translate.google.com.bz','translate.google.com.co','translate.google.com.cu', + 'translate.google.com.cy','translate.google.com.do','translate.google.com.ec', + 'translate.google.com.eg','translate.google.com.et','translate.google.com.fj', + 'translate.google.com.gh','translate.google.com.gi','translate.google.com.gt', + 'translate.google.com.hk','translate.google.com.jm','translate.google.com.kh', + 'translate.google.com.kw','translate.google.com.lb','translate.google.com.lc', + 'translate.google.com.ly','translate.google.com.mm','translate.google.com.mt', + 'translate.google.com.mx','translate.google.com.my','translate.google.com.na', + 'translate.google.com.ng','translate.google.com.ni','translate.google.com.np', + 'translate.google.com.om','translate.google.com.pa','translate.google.com.pe', + 'translate.google.com.pg','translate.google.com.ph','translate.google.com.pk', + 'translate.google.com.pr','translate.google.com.py','translate.google.com.qa', + 'translate.google.com.sa','translate.google.com.sb','translate.google.com.sg', + 'translate.google.com.sl','translate.google.com.sv','translate.google.com.tj', + 'translate.google.com.tr','translate.google.com.tw','translate.google.com.ua', + 'translate.google.com.uy','translate.google.com.vc','translate.google.com.vn', + 'translate.google.com','translate.google.cv','translate.google.cx', + 'translate.google.cz','translate.google.de','translate.google.dj', + 'translate.google.dk','translate.google.dm','translate.google.dz', + 'translate.google.ee','translate.google.es','translate.google.eu', + 'translate.google.fi','translate.google.fm','translate.google.fr', + 'translate.google.ga','translate.google.ge','translate.google.gf', + 'translate.google.gg','translate.google.gl','translate.google.gm', + 'translate.google.gp','translate.google.gr','translate.google.gy', + 'translate.google.hn','translate.google.hr','translate.google.ht', + 'translate.google.hu','translate.google.ie','translate.google.im', + 'translate.google.io','translate.google.iq','translate.google.is', + 'translate.google.it','translate.google.je','translate.google.jo', + 'translate.google.kg','translate.google.ki','translate.google.kz', + 'translate.google.la','translate.google.li','translate.google.lk', + 'translate.google.lt','translate.google.lu','translate.google.lv', + 'translate.google.md','translate.google.me','translate.google.mg', + 'translate.google.mk','translate.google.ml','translate.google.mn', + 'translate.google.ms','translate.google.mu','translate.google.mv', + 'translate.google.mw','translate.google.ne','translate.google.nf', + 'translate.google.nl','translate.google.no','translate.google.nr', + 'translate.google.nu','translate.google.pl','translate.google.pn', + 'translate.google.ps','translate.google.pt','translate.google.ro', + 'translate.google.rs','translate.google.ru','translate.google.rw', + 'translate.google.sc','translate.google.se','translate.google.sh', + 'translate.google.si','translate.google.sk','translate.google.sm', + 'translate.google.sn','translate.google.so','translate.google.sr', + 'translate.google.st','translate.google.td','translate.google.tg', + 'translate.google.tk','translate.google.tl','translate.google.tm', + 'translate.google.tn','translate.google.to','translate.google.tt', + 'translate.google.us','translate.google.vg','translate.google.vu','translate.google.ws') + +URLS_SUFFIX = [re.search('translate.google.(.*)', url.strip()).group(1) for url in DEFAULT_SERVICE_URLS] +URL_SUFFIX_DEFAULT = 'com' + +class google_new_transError(Exception): + """Exception that uses context to present a meaningful error message""" + + def __init__(self, msg=None, **kwargs): + self.tts = kwargs.pop('tts', None) + self.rsp = kwargs.pop('response', None) + if msg: + self.msg = msg + elif self.tts is not None: + self.msg = self.infer_msg(self.tts, self.rsp) + else: + self.msg = None + super(google_new_transError, self).__init__(self.msg) + + def infer_msg(self, tts, rsp=None): + cause = "Unknown" + + if rsp is None: + premise = "Failed to connect" + + return "{}. Probable cause: {}".format(premise, "timeout") + # if tts.tld != 'com': + # host = _translate_url(tld=tts.tld) + # cause = "Host '{}' is not reachable".format(host) + + else: + status = rsp.status_code + reason = rsp.reason + + premise = "{:d} ({}) from TTS API".format(status, reason) + + if status == 403: + cause = "Bad token or upstream API changes" + elif status == 200 and not tts.lang_check: + cause = "No audio stream in response. Unsupported language '%s'" % self.tts.lang + elif status >= 500: + cause = "Uptream API error. Try again later." + + return "{}. Probable cause: {}".format(premise, cause) + + +class google_translator: + ''' + You can use 108 language in target and source,details view LANGUAGES. + Target language: like 'en'、'zh'、'th'... + + :param url_suffix: The source text(s) to be translated. Batch translation is supported via sequence input. + The value should be one of the url_suffix listed in : `DEFAULT_SERVICE_URLS` + :type url_suffix: UTF-8 :class:`str`; :class:`unicode`; string sequence (list, tuple, iterator, generator) + + :param text: The source text(s) to be translated. + :type text: UTF-8 :class:`str`; :class:`unicode`; + + :param lang_tgt: The language to translate the source text into. + The value should be one of the language codes listed in : `LANGUAGES` + :type lang_tgt: :class:`str`; :class:`unicode` + + :param lang_src: The language of the source text. + The value should be one of the language codes listed in :const:`googletrans.LANGUAGES` + If a language is not specified, + the system will attempt to identify the source language automatically. + :type lang_src: :class:`str`; :class:`unicode` + + :param timeout: Timeout Will be used for every request. + :type timeout: number or a double of numbers + + :param proxies: proxies Will be used for every request. + :type proxies: class : dict; like: {'http': 'http:171.112.169.47:19934/', 'https': 'https:171.112.169.47:19934/'} + + ''' + + def __init__(self, url_suffix="com", timeout=5, proxies=None): + self.proxies = proxies + if url_suffix not in URLS_SUFFIX: + self.url_suffix = URL_SUFFIX_DEFAULT + else: + self.url_suffix = url_suffix + url_base = "https://translate.google.{}".format(self.url_suffix) + self.url = url_base + "/_/TranslateWebserverUi/data/batchexecute" + self.timeout = timeout + + def _package_rpc(self, text, lang_src='auto', lang_tgt='auto'): + GOOGLE_TTS_RPC = ["MkEWBc"] + parameter = [[text.strip(), lang_src, lang_tgt, True], [1]] + escaped_parameter = json.dumps(parameter, separators=(',', ':')) + rpc = [[[random.choice(GOOGLE_TTS_RPC), escaped_parameter, None, "generic"]]] + espaced_rpc = json.dumps(rpc, separators=(',', ':')) + # text_urldecode = quote(text.strip()) + freq_initial = "f.req={}&".format(quote(espaced_rpc)) + freq = freq_initial + return freq + + def translate(self, text, lang_tgt='auto', lang_src='auto', pronounce=False): + try: + lang = LANGUAGES[lang_src] + except: + lang_src = 'auto' + try: + lang = LANGUAGES[lang_tgt] + except: + lang_src = 'auto' + text = str(text) + if len(text) >= 5000: + return "Warning: Can only detect less than 5000 characters" + if len(text) == 0: + return "" + headers = { + "Referer": "http://translate.google.{}/".format(self.url_suffix), + "User-Agent": + "Mozilla/5.0 (Windows NT 10.0; WOW64) " + "AppleWebKit/537.36 (KHTML, like Gecko) " + "Chrome/47.0.2526.106 Safari/537.36", + "Content-Type": "application/x-www-form-urlencoded;charset=utf-8" + } + freq = self._package_rpc(text, lang_src, lang_tgt) + response = requests.Request(method='POST', + url=self.url, + data=freq, + headers=headers, + ) + try: + if self.proxies == None or type(self.proxies) != dict: + self.proxies = {} + with requests.Session() as s: + s.proxies = self.proxies + r = s.send(request=response.prepare(), + verify=False, + timeout=self.timeout) + for line in r.iter_lines(chunk_size=1024): + decoded_line = line.decode('utf-8') + if "MkEWBc" in decoded_line: + try: + response = (decoded_line) + response = json.loads(response) + response = list(response) + response = json.loads(response[0][2]) + response_ = list(response) + response = response_[1][0] + if len(response) == 1: + if len(response[0]) > 5: + sentences = response[0][5] + else: ## only url + sentences = response[0][0] + if pronounce == False: + return sentences + elif pronounce == True: + return [sentences,None,None] + translate_text = "" + for sentence in sentences: + sentence = sentence[0] + translate_text += sentence.strip() + ' ' + translate_text = translate_text + if pronounce == False: + return translate_text + elif pronounce == True: + pronounce_src = (response_[0][0]) + pronounce_tgt = (response_[1][0][0][1]) + return [translate_text, pronounce_src, pronounce_tgt] + elif len(response) == 2: + sentences = [] + for i in response: + sentences.append(i[0]) + if pronounce == False: + return sentences + elif pronounce == True: + pronounce_src = (response_[0][0]) + pronounce_tgt = (response_[1][0][0][1]) + return [sentences, pronounce_src, pronounce_tgt] + except Exception as e: + raise e + r.raise_for_status() + except requests.exceptions.ConnectTimeout as e: + raise e + except requests.exceptions.HTTPError as e: + # Request successful, bad response + raise google_new_transError(tts=self, response=r) + except requests.exceptions.RequestException as e: + # Request failed + raise google_new_transError(tts=self) + + def detect(self, text): + text = str(text) + if len(text) >= 5000: + return log.debug("Warning: Can only detect less than 5000 characters") + if len(text) == 0: + return "" + headers = { + "Referer": "http://translate.google.{}/".format(self.url_suffix), + "User-Agent": + "Mozilla/5.0 (Windows NT 10.0; WOW64) " + "AppleWebKit/537.36 (KHTML, like Gecko) " + "Chrome/47.0.2526.106 Safari/537.36", + "Content-Type": "application/x-www-form-urlencoded;charset=utf-8" + } + freq = self._package_rpc(text) + response = requests.Request(method='POST', + url=self.url, + data=freq, + headers=headers) + try: + if self.proxies == None or type(self.proxies) != dict: + self.proxies = {} + with requests.Session() as s: + s.proxies = self.proxies + r = s.send(request=response.prepare(), + verify=False, + timeout=self.timeout) + + for line in r.iter_lines(chunk_size=1024): + decoded_line = line.decode('utf-8') + if "MkEWBc" in decoded_line: + # regex_str = r"\[\[\"wrb.fr\",\"MkEWBc\",\"\[\[(.*).*?,\[\[\[" + try: + # data_got = re.search(regex_str,decoded_line).group(1) + response = (decoded_line) + response = json.loads(response) + response = list(response) + response = json.loads(response[0][2]) + response = list(response) + detect_lang = response[0][2] + except Exception: + raise Exception + # data_got = data_got.split('\\\"]')[0] + return [detect_lang, LANGUAGES[detect_lang.lower()]] + r.raise_for_status() + except requests.exceptions.HTTPError as e: + # Request successful, bad response + log.debug(str(e)) + raise google_new_transError(tts=self, response=r) + except requests.exceptions.RequestException as e: + # Request failed + log.debug(str(e)) + raise google_new_transError(tts=self) \ No newline at end of file