|
""" |
|
This file bundles language identification functions. |
|
|
|
Modifications (fork): Copyright (c) 2021, Adrien Barbaresi. |
|
|
|
Original code: Copyright (c) 2011 Marco Lui <saffsd@gmail.com>. |
|
Based on research by Marco Lui and Tim Baldwin. |
|
|
|
See LICENSE file for more info. |
|
https://github.com/adbar/py3langid |
|
|
|
Projects: |
|
https://github.com/juntaosun/LangSegment |
|
""" |
|
|
|
import os |
|
import re |
|
import sys |
|
import numpy as np |
|
from collections import Counter |
|
from collections import defaultdict |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from py3langid.langid import LanguageIdentifier, MODEL_FILE |
|
|
|
|
|
try:from .utils.num import num2str |
|
except ImportError: |
|
try:from utils.num import num2str |
|
except ImportError as e: |
|
raise e |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class LangSSML:
    """Render SSML-style numeric content as spoken Chinese.

    Supports digit-by-digit numbers, telephone numbers (reading 1 as 幺),
    currency-style amounts (via the project-level ``num2str`` helper) and
    date/time strings.
    """

    def __init__(self):
        # Digit -> Chinese numeral lookup for digit-by-digit reading.
        self._zh_numerals_number = {
            '0': '零', '1': '一', '2': '二', '3': '三', '4': '四',
            '5': '五', '6': '六', '7': '七', '8': '八', '9': '九',
        }

    def _format_chinese_data(self, date_str:str):
        """Normalize a date/time string and render it in Chinese.

        Accepts forms like "2023-05-01 08:30:00", "2023/05/01",
        "2023年5月1日" or a bare "08:30" time. Empty input yields "".
        """
        if date_str is None or date_str.strip() == "":
            return ""
        # Unify date separators (slashes, dots, 年/月) to "-", drop 日.
        normalized = re.sub(r"[\/\._|年|月]", "-", date_str)
        normalized = re.sub(r"日", r"", normalized)
        pieces = normalized.split(' ')
        if len(pieces) == 1 and ":" in pieces[0]:
            # A lone "HH:MM[:SS]" token is a time, not a date.
            clock_str = pieces[0]
            pieces = []
        else:
            clock_str = pieces[1] if len(pieces) >= 2 else ""

        def render(num, unit, func=None):
            # Convert the value (if a converter is given), then append the
            # unit character; empty or "0" results render as "".
            if func is not None:
                num = func(num)
            if num is not None and num != "" and num != "0":
                return f"{num}{unit}"
            return ""

        as_digits = self.to_chinese_number
        as_amount = self.to_chinese_currency

        date_part = ""
        if len(pieces) > 0:
            year = month = day = ""
            fields = pieces[0].split('-')
            if len(fields) == 3:
                year, month, day = fields
            elif len(fields) == 2:
                # A 4-char leading field is a year, otherwise month-day.
                if len(fields[0]) == 4:
                    year, month = fields
                else:
                    month, day = fields
            elif len(fields[0]) > 0:
                if len(fields[0]) == 4:
                    year = fields[0]
                else:
                    day = fields[0]
            year = render(year, "年", as_digits)
            month = render(month, "月", as_amount)
            day = render(day, "日", as_amount)
            # Collapse accidental runs of the unit characters.
            date_part = re.sub(r"([年|月|日])+", r"\1", f"{year}{month}{day}")

        clock_str = re.sub(r"[\/\.\-:_]", ":", clock_str)
        clock_fields = clock_str.split(":")
        hours = minutes = seconds = ""
        if len(clock_fields) == 3:
            hours, minutes, seconds = clock_fields
        elif len(clock_fields) == 2:
            hours, minutes = clock_fields
        elif len(clock_fields[0]) > 0:
            hours = f'{clock_fields[0]}点'
        if len(clock_fields) > 1:
            hours = render(hours, "点", as_amount)
            minutes = render(minutes, "分", as_amount)
            seconds = render(seconds, "秒", as_amount)
        time_part = re.sub(r"([点|分|秒])+", r"\1", f"{hours}{minutes}{seconds}")
        return f"{date_part}{time_part}"

    def to_chinese_number(self, num:str):
        """Read digits one by one, e.g. "123" -> "一二三"; "." becomes 点.

        Non-digit characters are passed through unchanged.
        """
        pattern = r'(\d+)'
        numerals = self._zh_numerals_number
        result = ""
        for chunk in re.split(pattern, num):
            if re.match(pattern, chunk):
                result += ''.join(numerals.get(digit, "") for digit in str(chunk))
            else:
                result += chunk
        return result.replace(".", "点")

    def to_chinese_telephone(self, num:str):
        """Read a phone number digit by digit, with 一 spoken as 幺
        (telephony convention); a leading "+86" country code is removed."""
        spoken = self.to_chinese_number(num.replace("+86", ""))
        return spoken.replace("一", "幺")

    def to_chinese_currency(self, num:str):
        """Render numeric runs as spoken amounts via ``num2str``;
        "." becomes 点 and non-digit text is passed through."""
        pattern = r'(\d+)'
        result = ""
        for chunk in re.split(pattern, num):
            if re.match(pattern, chunk):
                result += num2str(chunk)
            else:
                result += chunk
        return result.replace(".", "点")

    def to_chinese_date(self, num:str):
        """Render a date/time string in spoken Chinese."""
        return self._format_chinese_data(num)
|
|
|
|
|
class LangSegment:
    """Multilingual text segmenter.

    Splits mixed-language text into runs tagged with a language code
    (classified by py3langid), with heuristics for zh/ja ambiguity,
    SSML-like <tag>...</tag> markup, bare numbers, quoted spans and
    pinyin tokens. Results honor the ``Langfilters`` whitelist, whose
    order also encodes priority for ambiguous short text.
    """

    def __init__(self):
        # Pre-trained language identifier with normalized probabilities.
        self.langid = LanguageIdentifier.from_pickled_model(MODEL_FILE, norm_probs=True)

        # Per-run working state (reset by _clears / getTexts):
        self._text_cache = None   # placeholder key -> (handler, (tag, regex match))
        self._text_lasts = None   # last input text (cache key for getTexts)
        self._text_langs = None   # cached result of the last getTexts call
        self._lang_count = None   # per-language character statistics
        self._lang_eos = None     # True while processing the final segment of a line
        # NOTE(review): self._text_waits is first assigned in _clears/getTexts,
        # not here — _addwords would raise if called before getTexts. Confirm
        # no external caller reaches _addwords directly.

        # SSML-like markup: <tag>content</tag> (closing tag loosely matched).
        self.SYMBOLS_PATTERN = r'(<([a-zA-Z|-]*)>(.*?)<\/*[a-zA-Z|-]*>)'

        # Languages kept in the output by default.
        self.DEFAULT_FILTERS = ["zh", "ja", "ko", "en"]

        # Active language filter list; order encodes priority.
        self.Langfilters = self.DEFAULT_FILTERS[:]

        # Merge adjacent output segments that share a language.
        self.isLangMerge = True

        # Experimental per-sentence handling of Latin-script text (fr/vi).
        self.EnablePreview = False

        # Below this classifier score, short Chinese-character runs defer to
        # the filter order to choose between zh and ja.
        self.LangPriorityThreshold = 0.89

        # Keep parenthesized pinyin-like tokens as Chinese.
        self.keepPinyin = False

        # Placeholder token format used to protect matched spans: ⑥<tag><digits>⑥.
        self.PARSE_TAG = re.compile(r'(⑥\$*\d+[\d]{6,}⑥)')

        # SSML number/telephone/currency/date renderer.
        self.LangSSML = LangSSML()

    def _clears(self):
        """Reset all per-run state."""
        self._text_cache = None
        self._text_lasts = None
        self._text_langs = None
        self._text_waits = None
        self._lang_count = None
        self._lang_eos = None

    def _is_english_word(self, word):
        """Return True if ``word`` consists of ASCII letters only."""
        return bool(re.match(r'^[a-zA-Z]+$', word))

    def _is_chinese(self, word):
        """Return True if ``word`` contains at least one CJK unified ideograph."""
        for char in word:
            if '\u4e00' <= char <= '\u9fff':
                return True
        return False

    def _is_japanese_kana(self, word):
        """Return True if ``word`` contains hiragana or katakana."""
        pattern = re.compile(r'[\u3040-\u309F\u30A0-\u30FF]+')
        matches = pattern.findall(word)
        return len(matches) > 0

    def _insert_english_uppercase(self, word):
        """Split CamelCase-like runs by inserting a space before interior
        capitals; returns the text with a trailing space appended."""
        modified_text = re.sub(r'(?<!\b)([A-Z])', r' \1', word)
        modified_text = modified_text.strip('-')
        return modified_text + " "

    def _split_camel_case(self, word):
        """Insert a space before every non-initial uppercase letter."""
        return re.sub(r'(?<!^)(?=[A-Z])', ' ', word)

    def _statistics(self, language, text):
        """Accumulate per-language character counts (zh counts double)."""
        if self._lang_count is None or not isinstance(self._lang_count, defaultdict):
            self._lang_count = defaultdict(int)
        lang_count = self._lang_count
        if not "|" in language:
            # Ambiguous "zh|ja"-style labels are not counted.
            lang_count[language] += int(len(text)*2) if language == "zh" else len(text)
        self._lang_count = lang_count

    def _clear_text_number(self, text):
        """Return (text stripped of punctuation/newlines, digits-only flag)."""
        if text == "\n":return text,False
        clear_text = re.sub(r'([^\w\s]+)','',re.sub(r'\n+','',text)).strip()
        is_number = len(re.sub(re.compile(r'(\d+)'),'',clear_text)) == 0
        return clear_text,is_number

    def _saveData(self, words,language:str,text:str,score:float,symbol=None):
        """Append a {lang,text,score,symbol} record to ``words``, merging
        with the previous record when languages match or the new text is
        punctuation-/digits-only; honors the Langfilters whitelist."""
        clear_text , is_number = self._clear_text_number(text)

        preData = words[-1] if len(words) > 0 else None
        if symbol is not None:pass
        elif preData is not None and preData["symbol"] is None:
            # Punctuation-only or digits-only text inherits the previous language.
            if len(clear_text) == 0:language = preData["lang"]
            elif is_number == True:language = preData["lang"]
            _ , pre_is_number = self._clear_text_number(preData["text"])
            if (preData["lang"] == language):
                self._statistics(preData["lang"],text)
                # Same language: extend the previous record in place.
                text = preData["text"] + text
                preData["text"] = text
                return preData
            elif pre_is_number == True:
                # A dangling number joins the segment that follows it.
                text = f'{preData["text"]}{text}'
                words.pop()
        elif is_number == True:
            # First record is digits-only: use the top-priority filter language.
            priority_language = self._get_filters_string()[:2]
            if priority_language in "ja-zh-en-ko-fr-vi":language = priority_language
        data = {"lang":language,"text": text,"score":score,"symbol":symbol}
        filters = self.Langfilters
        if filters is None or len(filters) == 0 or "?" in language or \
            language in filters or language in filters[0] or \
            filters[0] == "*" or filters[0] in "alls-mixs-autos":
            words.append(data)
            self._statistics(data["lang"],data["text"])
        return data

    def _addwords(self, words,language,text,score,symbol=None):
        """Route one classified span into ``words``, resolving deferred
        ambiguous "xx|yy" labels against the pending/previous segment."""
        if text == "\n":pass
        elif text is None or len(text.strip()) == 0:return True
        if language is None:language = ""
        language = language.lower()
        if language == 'en':text = self._insert_english_uppercase(text)

        text_waits = self._text_waits
        ispre_waits = len(text_waits)>0
        preResult = text_waits.pop() if ispre_waits else None
        if preResult is None:preResult = words[-1] if len(words) > 0 else None
        if preResult and ("|" in preResult["lang"]):
            # Resolve a previously deferred ambiguous label using the new one.
            pre_lang = preResult["lang"]
            if language in pre_lang:preResult["lang"] = language = language.split("|")[0]
            else:preResult["lang"]=pre_lang.split("|")[0]
            if ispre_waits:preResult = self._saveData(words,preResult["lang"],preResult["text"],preResult["score"],preResult["symbol"])
        pre_lang = preResult["lang"] if preResult else None
        if ("|" in language) and (pre_lang and not pre_lang in language and not "…" in language):language = language.split("|")[0]
        # Still ambiguous: defer this span until the next call resolves it.
        if "|" in language:self._text_waits.append({"lang":language,"text": text,"score":score,"symbol":symbol})
        else:self._saveData(words,language,text,score,symbol)
        return False

    def _get_prev_data(self, words):
        """Return (lang, text) of the last appended record, or (None, "")."""
        data = words[-1] if words and len(words) > 0 else None
        if data:return (data["lang"] , data["text"])
        return (None,"")

    def _match_ending(self, input , index):
        """Return (match, char) telling whether the character at ``index``
        (after removing whitespace) is a sentence-ending or quote mark."""
        if input is None or len(input) == 0:return False,None
        input = re.sub(r'\s+', '', input)
        if len(input) == 0 or abs(index) > len(input):return False,None
        ending_pattern = re.compile(r'([「」“”‘’"\'::。.!!?.?])')
        return ending_pattern.match(input[index]),input[index]

    def _cleans_text(self, cleans_text):
        """Replace non-word runs with spaces and squeeze repeated characters."""
        cleans_text = re.sub(r'(.*?)([^\w]+)', r'\1 ', cleans_text)
        cleans_text = re.sub(r'(.)\1+', r'\1', cleans_text)
        return cleans_text.strip()

    def _mean_processing(self, text:str):
        """Classify each camel-case-split word longer than 3 chars and
        return (majority language, 1.0), or (None, 0.0) if nothing usable."""
        if text is None or (text.strip()) == "":return None , 0.0
        arrs = self._split_camel_case(text).split(" ")
        langs = []
        for t in arrs:
            if len(t.strip()) <= 3:continue
            language, score = self.langid.classify(t)
            langs.append({"lang":language})
        if len(langs) == 0:return None , 0.0
        return Counter([item['lang'] for item in langs]).most_common(1)[0][0],1.0

    def _lang_classify(self, cleans_text):
        """Run the langid classifier and return (language, score), with a
        numpy scalar score converted to a rounded Python float."""
        language, score = self.langid.classify(cleans_text)
        if score is not None and isinstance(score, np.generic) and hasattr(score,"item"):
            score = score.item()
            score = round(score , 3)
        return language, score

    def _get_filters_string(self):
        """Join the active filters as a lowercase dash-separated string."""
        filters = self.Langfilters
        return "-".join(filters).lower().strip() if filters is not None else ""

    def _parse_language(self, words , segment):
        """Split ``segment`` on punctuation runs and classify each piece,
        applying zh/ja disambiguation heuristics for short CJK text."""
        LANG_JA = "ja"
        LANG_ZH = "zh"
        LANG_ZH_JA = f'{LANG_ZH}|{LANG_JA}'
        LANG_JA_ZH = f'{LANG_JA}|{LANG_ZH}'
        language = LANG_ZH
        regex_pattern = re.compile(r'([^\w\s]+)')
        lines = regex_pattern.split(segment)
        lines_max = len(lines)
        LANG_EOS =self._lang_eos
        for index, text in enumerate(lines):
            if len(text) == 0:continue
            EOS = index >= (lines_max - 1)
            nextId = index + 1
            nextText = lines[nextId] if not EOS else ""
            # Whether the next / current piece is punctuation only.
            nextPunc = len(re.sub(regex_pattern,'',re.sub(r'\n+','',nextText)).strip()) == 0
            textPunc = len(re.sub(regex_pattern,'',re.sub(r'\n+','',text)).strip()) == 0
            if not EOS and (textPunc == True or ( len(nextText.strip()) >= 0 and nextPunc == True)):
                # Fold punctuation into the following piece and defer.
                lines[nextId] = f'{text}{nextText}'
                continue
            number_tags = re.compile(r'(⑥\d{6,}⑥)')
            cleans_text = re.sub(number_tags, '' ,text)
            cleans_text = re.sub(r'\d+', '' ,cleans_text)
            cleans_text = self._cleans_text(cleans_text)

            if not EOS and len(cleans_text) <= 2:
                # Too short to classify reliably; merge with the next piece.
                lines[nextId] = f'{text}{nextText}'
                continue
            language,score = self._lang_classify(cleans_text)
            prev_language , prev_text = self._get_prev_data(words)
            # All-ideograph text is forced to zh regardless of classifier output.
            if language != LANG_ZH and all('\u4e00' <= c <= '\u9fff' for c in re.sub(r'\s','',cleans_text)):language,score = LANG_ZH,1
            if len(cleans_text) <= 5 and self._is_chinese(cleans_text):
                filters_string = self._get_filters_string()
                if score < self.LangPriorityThreshold and len(filters_string) > 0:
                    # Low-confidence short CJK: prefer whichever of ja/zh
                    # appears first in the filter order.
                    index_ja , index_zh = filters_string.find(LANG_JA) , filters_string.find(LANG_ZH)
                    if index_ja != -1 and index_ja < index_zh:language = LANG_JA
                    elif index_zh != -1 and index_zh < index_ja:language = LANG_ZH
                if self._is_japanese_kana(cleans_text):language = LANG_JA
                elif len(cleans_text) > 2 and score > 0.90:pass
                elif EOS and LANG_EOS:language = LANG_ZH if len(cleans_text) <= 1 else language
                else:
                    # Still ambiguous: emit a deferred "zh|ja"/"ja|zh" label.
                    LANG_UNKNOWN = LANG_ZH_JA if language == LANG_ZH or (len(cleans_text) <=2 and prev_language == LANG_ZH) else LANG_JA_ZH
                    match_end,match_char = self._match_ending(text, -1)
                    referen = prev_language in LANG_UNKNOWN or LANG_UNKNOWN in prev_language if prev_language else False
                    if match_char in "。.": language = prev_language if referen and len(words) > 0 else language
                    else:language = f"{LANG_UNKNOWN}|…"
            # Restore number placeholders before storing the span.
            text,*_ = re.subn(number_tags , self._restore_number , text )
            self._addwords(words,language,text,score)

    def _process_symbol_SSML(self, words,data):
        """Handle <telephone>/<number>/<currency>/<date> tags by converting
        their content to spoken Chinese; other tags pass through unchanged."""
        tag , match = data
        language = SSML = match[1]
        text = match[2]
        score = 1.0
        if SSML == "telephone":
            language = "zh"
            text = self.LangSSML.to_chinese_telephone(text)
        elif SSML == "number":
            language = "zh"
            text = self.LangSSML.to_chinese_number(text)
        elif SSML == "currency":
            language = "zh"
            text = self.LangSSML.to_chinese_currency(text)
        elif SSML == "date":
            language = "zh"
            text = self.LangSSML.to_chinese_date(text)
        self._addwords(words,language,text,score,SSML)

    def _restore_number(self, matche):
        """re.sub callback: swap a ⑥...⑥ placeholder back to its cached text."""
        value = matche.group(0)
        text_cache = self._text_cache
        if value in text_cache:
            process , data = text_cache[value]
            tag , match = data
            value = match
        return value

    def _pattern_symbols(self, item , text):
        """Replace each match of ``item``'s pattern with a unique ⑥tagNNNNNN⑥
        placeholder and cache (handler, match) for later dispatch."""
        if text is None:return text
        tag , pattern , process = item
        matches = pattern.findall(text)
        if len(matches) == 1 and "".join(matches[0]) == text:
            # The whole line is a single match: leave it for direct parsing.
            return text
        for i , match in enumerate(matches):
            key = f"⑥{tag}{i:06d}⑥"
            text = re.sub(pattern , key , text , count=1)
            self._text_cache[key] = (process , (tag , match))
        return text

    def _process_symbol(self, words,data):
        """Handle a <lang>text</lang> tag; tags whose name is not an active
        filter language fall through to the SSML number/date handlers."""
        tag , match = data
        language = match[1]
        text = match[2]
        score = 1.0
        filters = self._get_filters_string()
        if language not in filters:
            self._process_symbol_SSML(words,data)
        else:
            self._addwords(words,language,text,score,True)

    def _process_english(self, words,data):
        """Handle a Latin-script run: verbatim "en" by default, or per-sentence
        reclassification (fr/vi support) when EnablePreview is on."""
        tag , match = data
        text = match[0]
        filters = self._get_filters_string()
        priority_language = filters[:2]

        enablePreview = self.EnablePreview
        if enablePreview == True:
            # Split into sentences and classify each one separately.
            regex_pattern = re.compile(r'(.*?[。.??!!]+[\n]{,1})')
            lines = regex_pattern.split(text)
            for index , text in enumerate(lines):
                if len(text.strip()) == 0:continue
                cleans_text = self._cleans_text(text)
                language,score = self._lang_classify(cleans_text)
                if language not in filters:
                    language,score = self._mean_processing(cleans_text)
                if language is None or score <= 0.0:continue
                elif language in filters:pass
                elif score >= 0.95:continue
                elif score <= 0.15 and filters[:2] == "fr":language = priority_language
                else:language = "en"
                self._addwords(words,language,text,score)
        else:
            # Preview off: treat the whole run as English.
            language, score = "en", 1.0
            self._addwords(words,language,text,score)

    def _process_Russian(self, words,data):
        """Handle a Cyrillic-script run as Russian."""
        tag , match = data
        text = match[0]
        language = "ru"
        score = 1.0
        self._addwords(words,language,text,score)

    def _process_Thai(self, words,data):
        """Handle a Thai-script run."""
        tag , match = data
        text = match[0]
        language = "th"
        score = 1.0
        self._addwords(words,language,text,score)

    def _process_korean(self, words,data):
        """Handle a Hangul-script run."""
        tag , match = data
        text = match[0]
        language = "ko"
        score = 1.0
        self._addwords(words,language,text,score)

    def _process_quotes(self, words,data):
        """Handle quoted/bracketed spans: recurse for nested placeholders,
        free-parse short quotes, classify longer ones as a single unit."""
        tag , match = data
        text = "".join(match)
        childs = self.PARSE_TAG.findall(text)
        if len(childs) > 0:
            self._process_tags(words , text , False)
        else:
            cleans_text = self._cleans_text(match[1])
            if len(cleans_text) <= 5:
                self._parse_language(words,text)
            else:
                language,score = self._lang_classify(cleans_text)
                self._addwords(words,language,text,score)

    def _process_pinyin(self, words,data):
        """Keep a parenthesized pinyin-like token as Chinese."""
        tag , match = data
        text = match
        language = "zh"
        score = 1.0
        self._addwords(words,language,text,score)

    def _process_number(self, words,data):
        """
        Numbers alone cannot accurately identify language.
        Because numbers are universal in all languages.
        So it won't be executed here, just for testing.
        """
        tag , match = data
        # Inherit the first segment's language, defaulting to zh.
        language = words[0]["lang"] if len(words) > 0 else "zh"
        text = match
        score = 0.0
        self._addwords(words,language,text,score)

    def _process_tags(self, words , text , root_tag):
        """Walk the placeholder-tagged text: dispatch cached handlers for
        placeholder tokens, free-parse everything in between."""
        text_cache = self._text_cache
        segments = re.split(self.PARSE_TAG, text)
        segments_len = len(segments) - 1
        for index , text in enumerate(segments):
            # Only the root walk tracks end-of-segment for EOS heuristics.
            if root_tag:self._lang_eos = index >= segments_len
            if self.PARSE_TAG.match(text):
                process , data = text_cache[text]
                if process:process(words , data)
            else:
                self._parse_language(words , text)
        return words

    def _merge_results(self, words):
        """Merge consecutive records sharing a language; drop 'symbol' keys."""
        new_word = []
        for index , cur_data in enumerate(words):
            if "symbol" in cur_data:del cur_data["symbol"]
            if index == 0:new_word.append(cur_data)
            else:
                pre_data = new_word[-1]
                if cur_data["lang"] == pre_data["lang"]:
                    pre_data["text"] = f'{pre_data["text"]}{cur_data["text"]}'
                else:new_word.append(cur_data)
        return new_word

    def _parse_symbols(self, text):
        """Main pipeline: protect script runs and markup behind placeholders,
        parse line by line, then merge segments and tally statistics."""
        TAG_NUM = "00"
        TAG_S1,TAG_S2,TAG_P1,TAG_P2,TAG_EN,TAG_KO,TAG_RU,TAG_TH = "$1" ,"$2" ,"$3" ,"$4" ,"$5" ,"$6" ,"$7","$8"
        # Template: a run of LANGUAGE-script characters with optional leading quotes.
        TAG_BASE = re.compile(fr'(([【《((“‘"\']*[LANGUAGE]+[\W\s]*)+)')

        filters = self.Langfilters
        filters = filters if filters is not None else ""

        # fr/vi require the preview path so accented Latin text is kept intact.
        enablePreview = self.EnablePreview
        if "fr" in filters or \
            "vi" in filters:enablePreview = True
        self.EnablePreview = enablePreview

        # Accented letters folded into the Latin run only in preview mode.
        RE_FR = "" if not enablePreview else "àáâãäåæçèéêëìíîïðñòóôõöùúûüýþÿ"

        RE_VI = "" if not enablePreview else "đơưăáàảãạắằẳẵặấầẩẫậéèẻẽẹếềểễệíìỉĩịóòỏõọốồổỗộớờởỡợúùủũụứừửữựôâêơưỷỹ"

        # Ordered (tag, pattern, handler) triples; list order sets precedence.
        process_list = [
            ( TAG_S1 , re.compile(self.SYMBOLS_PATTERN) , self._process_symbol ),
            ( TAG_KO , re.compile(re.sub(r'LANGUAGE',f'\uac00-\ud7a3',TAG_BASE.pattern)) , self._process_korean ),
            ( TAG_TH , re.compile(re.sub(r'LANGUAGE',f'\u0E00-\u0E7F',TAG_BASE.pattern)) , self._process_Thai ),
            ( TAG_RU , re.compile(re.sub(r'LANGUAGE',f'А-Яа-яЁё',TAG_BASE.pattern)) , self._process_Russian ),
            ( TAG_NUM , re.compile(r'(\W*\d+\W+\d*\W*\d*)') , self._process_number ),
            ( TAG_EN , re.compile(re.sub(r'LANGUAGE',f'a-zA-Z{RE_FR}{RE_VI}',TAG_BASE.pattern)) , self._process_english ),
            ( TAG_P1 , re.compile(r'(["\'])(.*?)(\1)') , self._process_quotes ),
            ( TAG_P2 , re.compile(r'([\n]*[【《((“‘])([^【《((“‘’”))》】]{3,})([’”))》】][\W\s]*[\n]{,1})') , self._process_quotes ),
        ]

        if self.keepPinyin == True:process_list.insert(1 ,
            ( TAG_S2 , re.compile(r'([\(({](?:\s*\w*\d\w*\s*)+[})\)])') , self._process_pinyin ),
        )

        words = []
        lines = re.findall(r'.*\n*', re.sub(self.PARSE_TAG, '' ,text))
        for index , text in enumerate(lines):
            if len(text.strip()) == 0:continue
            self._lang_eos = False
            self._text_cache = {}
            for item in process_list:
                text = self._pattern_symbols(item , text)
            cur_word = self._process_tags([] , text , True)
            if len(cur_word) == 0:continue
            cur_data = cur_word[0] if len(cur_word) > 0 else None
            pre_data = words[-1] if len(words) > 0 else None
            # Stitch a non-symbol segment onto a preceding same-language
            # symbol segment from the previous line.
            if cur_data and pre_data and cur_data["lang"] == pre_data["lang"] \
                and cur_data["symbol"] == False and pre_data["symbol"] :
                cur_data["text"] = f'{pre_data["text"]}{cur_data["text"]}'
                words.pop()
            words += cur_word
        if self.isLangMerge == True:words = self._merge_results(words)
        lang_count = self._lang_count
        if lang_count and len(lang_count) > 0:
            # Expose statistics as a count-sorted list of (lang, count) pairs.
            lang_count = dict(sorted(lang_count.items(), key=lambda x: x[1], reverse=True))
            lang_count = list(lang_count.items())
            self._lang_count = lang_count
        return words

    def setfilters(self, filters):
        """Set the language whitelist/priority list; resets caches on change."""
        if self.Langfilters != filters:
            self._clears()
            self.Langfilters = filters

    def getfilters(self):
        """Return the active language filter list."""
        return self.Langfilters

    def setPriorityThreshold(self, threshold:float):
        """Set the zh/ja disambiguation score threshold."""
        self.LangPriorityThreshold = threshold

    def getPriorityThreshold(self):
        """Return the zh/ja disambiguation score threshold."""
        return self.LangPriorityThreshold

    def getCounts(self):
        """Return [(lang, char_count), ...] sorted by count, computing from
        the cached segments if needed; defaults to [("zh", 0)]."""
        lang_count = self._lang_count
        if lang_count is not None:return lang_count
        text_langs = self._text_langs
        if text_langs is None or len(text_langs) == 0:return [("zh",0)]
        lang_counts = defaultdict(int)
        # zh characters are weighted double, matching _statistics.
        for d in text_langs:lang_counts[d['lang']] += int(len(d['text'])*2) if d['lang'] == "zh" else len(d['text'])
        lang_counts = dict(sorted(lang_counts.items(), key=lambda x: x[1], reverse=True))
        lang_counts = list(lang_counts.items())
        self._lang_count = lang_counts
        return lang_counts

    def getTexts(self, text:str):
        """Segment ``text`` into [{"lang","text","score"}, ...]; results are
        cached until the input text changes."""
        if text is None or len(text.strip()) == 0:
            self._clears()
            return []

        text_langs = self._text_langs
        if self._text_lasts == text and text_langs is not None:return text_langs

        self._text_waits = []
        self._lang_count = None
        self._text_lasts = text
        text = self._parse_symbols(text)
        self._text_langs = text
        return text

    def classify(self, text:str):
        """Alias for :meth:`getTexts`."""
        return self.getTexts(text)
|
|
|
def printList(langlist):
    """Print each classified segment on its own line.

    Prints a placeholder message when the list is empty or None.
    (功能:打印数组结果 / Function: print array results.)
    """
    print("\n===================【打印结果】===================")
    if not langlist:
        print("无内容结果,No content result")
        return
    for entry in langlist:
        print(entry)
|
|
|
|
|
|
|
def main():
    """Demo entry point: segment a multilingual paragraph, print the
    per-segment results and the per-language character statistics."""
    segmenter = LangSegment()
    # Filter order encodes priority for ambiguous short text.
    segmenter.setfilters(["fr", "vi", "ja", "zh", "ko", "en", "ru", "th"])
    text = """
我喜欢在雨天里听音乐。
I enjoy listening to music on rainy days.
雨の日に音楽を聴くのが好きです。
비 오는 날에 음악을 듣는 것을 즐깁니다。
J'aime écouter de la musique les jours de pluie.
Tôi thích nghe nhạc vào những ngày mưa.
Мне нравится слушать музыку в дождливую погоду.
ฉันชอบฟังเพลงในวันที่ฝนตก
    """
    segments = segmenter.getTexts(text)
    printList(segments)

    print("\n===================【语种统计】===================")
    counts = segmenter.getCounts()
    print(counts , "\n")

    # The first entry is the dominant language by (weighted) char count.
    top_lang , top_count = counts[0]
    print(f"输入内容的主要语言为 = {top_lang} ,字数 = {top_count}")
    print("==================================================\n")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__": |
|
main() |
|
|