import importlib
from codecs import IncrementalDecoder
from collections import Counter
from functools import lru_cache
from typing import Counter as TypeCounter, Dict, List, Optional, Tuple

from .constant import (
    FREQUENCIES,
    KO_NAMES,
    LANGUAGE_SUPPORTED_COUNT,
    TOO_SMALL_SEQUENCE,
    ZH_NAMES,
)
from .md import is_suspiciously_successive_range
from .models import CoherenceMatches
from .utils import (
    is_accentuated,
    is_latin,
    is_multi_byte_encoding,
    is_unicode_range_secondary,
    unicode_range,
)

def encoding_unicode_range(iana_name: str) -> List[str]:
    """
    Return the Unicode ranges covered by a single-byte code page.
    """
    if is_multi_byte_encoding(iana_name):
        raise IOError("Function not supported on multi-byte code page")

    decoder = importlib.import_module(
        "encodings.{}".format(iana_name)
    ).IncrementalDecoder

    p: IncrementalDecoder = decoder(errors="ignore")
    seen_ranges: Dict[str, int] = {}
    character_count: int = 0

    for i in range(0x40, 0xFF):
        chunk: str = p.decode(bytes([i]))

        if chunk:
            character_range: Optional[str] = unicode_range(chunk)

            if character_range is None:
                continue

            if is_unicode_range_secondary(character_range) is False:
                if character_range not in seen_ranges:
                    seen_ranges[character_range] = 0
                seen_ranges[character_range] += 1
                character_count += 1

    return sorted(
        [
            character_range
            for character_range in seen_ranges
            if seen_ranges[character_range] / character_count >= 0.15
        ]
    )
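# Usage sketch (illustrative; actual output depends on the encodings shipped
# with the interpreter and on the 0.15 coverage threshold above). A Cyrillic
# single-byte code page is expected to expose both ranges:
#
#   >>> encoding_unicode_range("cp1251")  # doctest: +SKIP
#   ['Basic Latin', 'Cyrillic']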


def unicode_range_languages(primary_range: str) -> List[str]:
    """
    Return inferred languages used with a Unicode range.
    """
    languages: List[str] = []

    for language, characters in FREQUENCIES.items():
        for character in characters:
            if unicode_range(character) == primary_range:
                languages.append(language)
                break

    return languages


@lru_cache()
def encoding_languages(iana_name: str) -> List[str]:
    """
    Single-byte encoding language association. Some code pages are heavily
    linked to particular language(s). This function provides that correspondence.
    """
    unicode_ranges: List[str] = encoding_unicode_range(iana_name)
    primary_range: Optional[str] = None

    for specified_range in unicode_ranges:
        if "Latin" not in specified_range:
            primary_range = specified_range
            break

    if primary_range is None:
        return ["Latin Based"]

    return unicode_range_languages(primary_range)
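# Usage sketch (hedged): a code page with no non-Latin primary range falls
# back to the generic "Latin Based" marker, while e.g. a Cyrillic code page
# is expected to map to Cyrillic-script languages from FREQUENCIES:
#
#   >>> encoding_languages("cp1252")  # doctest: +SKIP
#   ['Latin Based']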


@lru_cache()
def mb_encoding_languages(iana_name: str) -> List[str]:
    """
    Multi-byte encoding language association. Some code pages are heavily
    linked to particular language(s). This function provides that correspondence.
    """
    if (
        iana_name.startswith("shift_")
        or iana_name.startswith("iso2022_jp")
        or iana_name.startswith("euc_j")
        or iana_name == "cp932"
    ):
        return ["Japanese"]
    if iana_name.startswith("gb") or iana_name in ZH_NAMES:
        return ["Chinese"]
    if iana_name.startswith("iso2022_kr") or iana_name in KO_NAMES:
        return ["Korean"]

    return []
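# Usage sketch: the association is purely name-based, so results follow
# directly from the prefixes and name sets tested above:
#
#   >>> mb_encoding_languages("cp932")
#   ['Japanese']
#   >>> mb_encoding_languages("utf_8")
#   []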


@lru_cache(maxsize=LANGUAGE_SUPPORTED_COUNT)
def get_target_features(language: str) -> Tuple[bool, bool]:
    """
    Determine two main aspects of a supported language: whether it uses
    accentuated characters and whether its alphabet is purely Latin.
    """
    target_have_accents: bool = False
    target_pure_latin: bool = True

    for character in FREQUENCIES[language]:
        if not target_have_accents and is_accentuated(character):
            target_have_accents = True
        if target_pure_latin and is_latin(character) is False:
            target_pure_latin = False

    return target_have_accents, target_pure_latin
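# Usage sketch (values depend on the FREQUENCIES table): English is expected
# to come out accent-free and purely Latin, French accentuated but Latin:
#
#   >>> get_target_features("English")  # doctest: +SKIP
#   (False, True)
#   >>> get_target_features("French")  # doctest: +SKIP
#   (True, True)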


def alphabet_languages(
    characters: List[str], ignore_non_latin: bool = False
) -> List[str]:
    """
    Return the languages associated with the given characters.
    """
    languages: List[Tuple[str, float]] = []

    source_have_accents = any(is_accentuated(character) for character in characters)

    for language, language_characters in FREQUENCIES.items():
        target_have_accents, target_pure_latin = get_target_features(language)

        if ignore_non_latin and target_pure_latin is False:
            continue

        if target_have_accents is False and source_have_accents:
            continue

        character_count: int = len(language_characters)

        character_match_count: int = len(
            [c for c in language_characters if c in characters]
        )

        ratio: float = character_match_count / character_count

        if ratio >= 0.2:
            languages.append((language, ratio))

    languages = sorted(languages, key=lambda x: x[1], reverse=True)

    return [compatible_language[0] for compatible_language in languages]
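# Usage sketch (hedged): given the most popular characters of a text, return
# candidate languages, best ratio first. With the 0.2 gate above, at least a
# fifth of a language's frequency table must appear among the characters:
#
#   >>> alphabet_languages(["e", "a", "t", "i", "o", "n", "s", "r"])  # doctest: +SKIP
#   ['English', ...]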


def characters_popularity_compare(
    language: str, ordered_characters: List[str]
) -> float:
    """
    Determine how well an ordered character list (by occurrence, from most
    frequent to rarest) matches a particular language.
    The result is a ratio between 0.0 (no correspondence) and 1.0 (near-perfect fit).
    Beware that this function is deliberately lenient in order to ease
    detection: a close match counts as a full match.
    """
    if language not in FREQUENCIES:
        raise ValueError("{} not available".format(language))

    character_approved_count: int = 0
    FREQUENCIES_language_set = set(FREQUENCIES[language])

    ordered_characters_count: int = len(ordered_characters)
    target_language_characters_count: int = len(FREQUENCIES[language])

    large_alphabet: bool = target_language_characters_count > 26

    for character, character_rank in zip(
        ordered_characters, range(0, ordered_characters_count)
    ):
        if character not in FREQUENCIES_language_set:
            continue

        character_rank_in_language: int = FREQUENCIES[language].index(character)
        expected_projection_ratio: float = (
            target_language_characters_count / ordered_characters_count
        )
        character_rank_projection: int = int(character_rank * expected_projection_ratio)

        if (
            large_alphabet is False
            and abs(character_rank_projection - character_rank_in_language) > 4
        ):
            continue

        if (
            large_alphabet is True
            and abs(character_rank_projection - character_rank_in_language)
            < target_language_characters_count / 3
        ):
            character_approved_count += 1
            continue

        characters_before_source: List[str] = FREQUENCIES[language][
            0:character_rank_in_language
        ]
        characters_after_source: List[str] = FREQUENCIES[language][
            character_rank_in_language:
        ]
        characters_before: List[str] = ordered_characters[0:character_rank]
        characters_after: List[str] = ordered_characters[character_rank:]

        before_match_count: int = len(
            set(characters_before) & set(characters_before_source)
        )

        after_match_count: int = len(
            set(characters_after) & set(characters_after_source)
        )

        if len(characters_before_source) == 0 and before_match_count <= 4:
            character_approved_count += 1
            continue

        if len(characters_after_source) == 0 and after_match_count <= 4:
            character_approved_count += 1
            continue

        if (
            before_match_count / len(characters_before_source) >= 0.4
            or after_match_count / len(characters_after_source) >= 0.4
        ):
            character_approved_count += 1
            continue

    return character_approved_count / len(ordered_characters)
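# Worked sketch of the projection logic above (hypothetical numbers): with a
# 26-letter language and 13 observed characters, expected_projection_ratio is
# 26 / 13 = 2.0, so the character at observed rank 2 projects to language
# rank int(2 * 2.0) = 4. For a small alphabet the character is dropped when
# its true rank in the language differs from that projection by more than 4;
# otherwise the before/after neighborhood overlap (>= 0.4 on either side)
# decides approval.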


def alpha_unicode_split(decoded_sequence: str) -> List[str]:
    """
    Given a decoded text sequence, return a list of str. Unicode range /
    alphabet separation.
    Ex. a text containing English/Latin with a bit of Hebrew will return two
    items in the resulting list; one containing the Latin letters and the
    other the Hebrew ones.
    """
    layers: Dict[str, str] = {}

    for character in decoded_sequence:
        if character.isalpha() is False:
            continue

        character_range: Optional[str] = unicode_range(character)

        if character_range is None:
            continue

        layer_target_range: Optional[str] = None

        for discovered_range in layers:
            if (
                is_suspiciously_successive_range(discovered_range, character_range)
                is False
            ):
                layer_target_range = discovered_range
                break

        if layer_target_range is None:
            layer_target_range = character_range

        if layer_target_range not in layers:
            layers[layer_target_range] = character.lower()
            continue

        layers[layer_target_range] += character.lower()

    return list(layers.values())
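# Usage sketch: mixed Latin/Hebrew input yields one lowercased layer per
# (non suspiciously-successive) Unicode range; non-alphabetic characters
# are dropped:
#
#   >>> alpha_unicode_split("Hello שלום")  # doctest: +SKIP
#   ['hello', 'שלום']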


def merge_coherence_ratios(results: List[CoherenceMatches]) -> CoherenceMatches:
    """
    Merge results previously produced by the coherence_ratio function.
    The return type is the same as coherence_ratio.
    """
    per_language_ratios: Dict[str, List[float]] = {}
    for result in results:
        for sub_result in result:
            language, ratio = sub_result
            if language not in per_language_ratios:
                per_language_ratios[language] = [ratio]
                continue
            per_language_ratios[language].append(ratio)

    merge = [
        (
            language,
            round(
                sum(per_language_ratios[language]) / len(per_language_ratios[language]),
                4,
            ),
        )
        for language in per_language_ratios
    ]

    return sorted(merge, key=lambda x: x[1], reverse=True)
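# Usage sketch (hypothetical ratios): per-language ratios gathered from
# several chunks are averaged (rounded to 4 decimals) and re-sorted:
#
#   >>> merge_coherence_ratios([[("English", 0.8)], [("English", 0.4), ("French", 0.5)]])
#   [('English', 0.6), ('French', 0.5)]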


def filter_alt_coherence_matches(results: CoherenceMatches) -> CoherenceMatches:
    """
    We shall NOT return "English—" in CoherenceMatches because it is an
    alternative of "English". This function keeps only the best match and
    strips the em-dash from its name.
    """
    index_results: Dict[str, List[float]] = dict()

    for result in results:
        language, ratio = result
        no_em_name: str = language.replace("—", "")

        if no_em_name not in index_results:
            index_results[no_em_name] = []

        index_results[no_em_name].append(ratio)

    if any(len(index_results[e]) > 1 for e in index_results):
        filtered_results: CoherenceMatches = []

        for language in index_results:
            filtered_results.append((language, max(index_results[language])))

        return filtered_results

    return results
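# Usage sketch (hypothetical ratios): a trailing em-dash marks an alternative
# frequency table for the same language; only the best ratio per base name
# survives, under the stripped name:
#
#   >>> filter_alt_coherence_matches([("English", 0.5), ("English—", 0.7)])
#   [('English', 0.7)]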


@lru_cache(maxsize=2048)
def coherence_ratio(
    decoded_sequence: str, threshold: float = 0.1, lg_inclusion: Optional[str] = None
) -> CoherenceMatches:
    """
    Detect ANY language that can be identified in the given sequence. The
    sequence is analysed by layers: a layer is a character extraction by
    alphabet/Unicode range.
    """

    results: List[Tuple[str, float]] = []
    ignore_non_latin: bool = False

    sufficient_match_count: int = 0

    lg_inclusion_list = lg_inclusion.split(",") if lg_inclusion is not None else []
    if "Latin Based" in lg_inclusion_list:
        ignore_non_latin = True
        lg_inclusion_list.remove("Latin Based")

    for layer in alpha_unicode_split(decoded_sequence):
        sequence_frequencies: TypeCounter[str] = Counter(layer)
        most_common = sequence_frequencies.most_common()

        character_count: int = sum(o for c, o in most_common)

        if character_count <= TOO_SMALL_SEQUENCE:
            continue

        popular_character_ordered: List[str] = [c for c, o in most_common]

        for language in lg_inclusion_list or alphabet_languages(
            popular_character_ordered, ignore_non_latin
        ):
            ratio: float = characters_popularity_compare(
                language, popular_character_ordered
            )

            if ratio < threshold:
                continue
            elif ratio >= 0.8:
                sufficient_match_count += 1

            results.append((language, round(ratio, 4)))

            if sufficient_match_count >= 3:
                break

    return sorted(
        filter_alt_coherence_matches(results), key=lambda x: x[1], reverse=True
    )
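# Usage sketch (hedged; output depends on FREQUENCIES and the thresholds
# above): each layer must exceed TOO_SMALL_SEQUENCE alphabetic characters
# to be considered at all, and ratios come back rounded, best first.
#
#   >>> coherence_ratio("Ceci est un exemple de texte en français, avec ses accents.")  # doctest: +SKIP
#   [('French', ...), ...]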