Spaces:
Sleeping
Sleeping
from collections import defaultdict | |
import json | |
from itertools import product | |
import os | |
import unicodedata | |
# How Homoglyphs treats a character that is not in its current alphabet.
STRATEGY_LOAD = 1  # detect the character's language/category and load it
STRATEGY_IGNORE = 2  # keep the character in the result unchanged
STRATEGY_REMOVE = 3  # drop the character from the result

# Code points treated as ASCII by default.
ASCII_RANGE = range(128)

# The JSON data files are looked up in "homoglyph_data" next to this module.
CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
DATA_LOCATION = os.path.join(CURRENT_DIR, "homoglyph_data")
class Categories:
    """Look up Unicode categories using the bundled ``categories.json``.

    The file is read as a JSON object with two keys used here:
    ``"aliases"`` (the known category names) and ``"points"`` (entries of the
    form ``[start, end, category]`` describing inclusive code point ranges).
    All methods are class methods; the class is never instantiated.
    """

    fpath = os.path.join(DATA_LOCATION, "categories.json")

    @classmethod
    def _load(cls):
        """Read and return the parsed JSON content of ``cls.fpath``."""
        with open(cls.fpath, encoding="utf-8") as f:
            return json.load(f)

    @classmethod
    def _get_ranges(cls, categories):
        """Yield the ``[start, end]`` code point ranges of the given categories.

        This is a generator, so the file is read and the names are validated
        only once iteration starts.

        :param categories: container of category names
        :raises ValueError: if a name is not listed under ``"aliases"``
        """
        data = cls._load()

        # Validate every requested name before producing any range.
        for category in categories:
            if category not in data["aliases"]:
                raise ValueError("Invalid category: {}".format(category))

        for point in data["points"]:
            if point[2] in categories:
                yield point[:2]

    @classmethod
    def get_alphabet(cls, categories):
        """Return the set of characters belonging to the given categories.

        :param categories: container of category names
        :rtype: set
        :raises ValueError: if a category name is unknown
        """
        alphabet = set()
        for start, end in cls._get_ranges(categories):
            # Ranges in the data file are inclusive on both ends.
            alphabet.update(chr(code) for code in range(start, end + 1))
        return alphabet

    @classmethod
    def detect(cls, char):
        """Return the category of ``char``, or ``None`` if it cannot be found.

        :return: category
        :rtype: str
        """
        data = cls._load()

        # First try the first word of the character's Unicode name.
        try:
            category = unicodedata.name(char).split()[0]
        except (TypeError, ValueError):
            # No Unicode name (or not a single character): use the ranges.
            pass
        else:
            if category in data["aliases"]:
                return category

        # Fall back to searching the code point ranges from the JSON file.
        code = ord(char)
        for point in data["points"]:
            if point[0] <= code <= point[1]:
                return point[2]
        return None

    @classmethod
    def get_all(cls):
        """Return the set of all category names listed under ``"aliases"``."""
        return set(cls._load()["aliases"])
class Languages:
    """Look up language alphabets using the bundled ``languages.json``.

    The file is read as a JSON object mapping a language code to the
    characters of that language's alphabet. All methods are class methods;
    the class is never instantiated.
    """

    fpath = os.path.join(DATA_LOCATION, "languages.json")

    @classmethod
    def _load(cls):
        """Read and return the parsed JSON content of ``cls.fpath``."""
        with open(cls.fpath, encoding="utf-8") as f:
            return json.load(f)

    @classmethod
    def get_alphabet(cls, languages):
        """Return the combined alphabet of the given languages.

        :param languages: iterable of language codes
        :return: set of chars in alphabet by languages list
        :rtype: set
        :raises ValueError: if a language code is not present in the data file
        """
        data = cls._load()
        alphabet = set()
        for lang in languages:
            if lang not in data:
                raise ValueError("Invalid language code: {}".format(lang))
            alphabet.update(data[lang])
        return alphabet

    @classmethod
    def detect(cls, char):
        """Return every language whose alphabet contains ``char``.

        :return: set of languages which alphabet contains passed char.
        :rtype: set
        """
        data = cls._load()
        return {lang for lang, alphabet in data.items() if char in alphabet}

    @classmethod
    def get_all(cls):
        """Return the set of all language codes present in the data file."""
        return set(cls._load().keys())
class Homoglyphs:
    """Generate look-alike (homoglyph) variants of text within an alphabet.

    The alphabet is the union of the explicitly passed ``alphabet`` and the
    characters of the requested ``categories`` and ``languages``. Only
    confusable pairs whose both members are in the alphabet are used.
    """

    def __init__(
        self,
        categories=None,
        languages=None,
        alphabet=None,
        strategy=STRATEGY_IGNORE,
        ascii_strategy=STRATEGY_IGNORE,
        ascii_range=ASCII_RANGE,
    ):
        """Build the alphabet and the homoglyph lookup table.

        :param categories: iterable of category names to load
        :param languages: iterable of language codes to load
        :param alphabet: iterable of extra characters to include
        :param strategy: what to do with a character outside the alphabet
            (``STRATEGY_LOAD``, ``STRATEGY_IGNORE`` or ``STRATEGY_REMOVE``)
        :param ascii_strategy: with ``STRATEGY_IGNORE``, ASCII conversion
            yields nothing as soon as one character has no ASCII variant;
            with any other value such characters are skipped
        :param ascii_range: container of code points accepted as ASCII
        :raises ValueError: if ``strategy`` is not one of the known values
        """
        # strategies
        if strategy not in (STRATEGY_LOAD, STRATEGY_IGNORE, STRATEGY_REMOVE):
            raise ValueError("Invalid strategy")
        self.strategy = strategy
        self.ascii_strategy = ascii_strategy
        self.ascii_range = ascii_range

        # Homoglyphs needs some alphabet to work with; fall back to a default.
        if not categories and not languages and not alphabet:
            categories = ("LATIN", "COMMON")

        # cats and langs
        self.categories = set(categories or [])
        self.languages = set(languages or [])

        # alphabet
        self.alphabet = set(alphabet or [])
        if self.categories:
            self.alphabet.update(Categories.get_alphabet(self.categories))
        if self.languages:
            self.alphabet.update(Languages.get_alphabet(self.languages))
        self.table = self.get_table(self.alphabet)

    @staticmethod
    def get_table(alphabet):
        """Return the homoglyph table restricted to ``alphabet``.

        Both the keys and the listed homoglyphs are members of ``alphabet``.

        :rtype: collections.defaultdict mapping char -> set of homoglyphs
        """
        return Homoglyphs.get_restricted_table(alphabet, alphabet)

    @staticmethod
    def get_restricted_table(source_alphabet, target_alphabet):
        """Return homoglyphs of ``source_alphabet`` chars found in ``target_alphabet``.

        A character gets an entry only if at least one of its confusables is
        in ``target_alphabet``.

        :rtype: collections.defaultdict mapping char -> set of homoglyphs
        """
        table = defaultdict(set)
        path = os.path.join(DATA_LOCATION, "confusables_sept2022.json")
        # Explicit UTF-8: the platform default encoding is locale-dependent.
        with open(path, encoding="utf-8") as f:
            data = json.load(f)
        for char in source_alphabet:
            if char in data:
                for homoglyph in data[char]:
                    if homoglyph in target_alphabet:
                        table[char].add(homoglyph)
        return table

    @staticmethod
    def uniq_and_sort(data):
        """Return the unique items of ``data``, longest first, then ascending."""
        return sorted(set(data), key=lambda x: (-len(x), x))

    def _update_alphabet(self, char):
        """Extend the alphabet with the language or category of ``char``.

        :return: ``False`` if neither a language nor a category was detected
            (nothing is changed), ``True`` otherwise
        :rtype: bool
        """
        # Try to detect languages first.
        langs = Languages.detect(char)
        if langs:
            self.languages.update(langs)
            self.alphabet.update(Languages.get_alphabet(langs))
        else:
            # No language matched: try to detect the category instead.
            category = Categories.detect(char)
            if category is None:
                return False
            self.categories.add(category)
            self.alphabet.update(Categories.get_alphabet([category]))

        # Rebuild the table for the enlarged alphabet.
        self.table = self.get_table(self.alphabet)
        return True

    def _get_char_variants(self, char):
        """Return the sorted variants of ``char``, including ``char`` itself.

        For a character outside the alphabet the result depends on
        ``self.strategy``: the alphabet is extended (``STRATEGY_LOAD``), the
        character is returned alone (``STRATEGY_IGNORE``), or an empty list is
        returned (``STRATEGY_REMOVE``, or ``STRATEGY_LOAD`` when detection
        fails).

        :rtype: list
        """
        if char not in self.alphabet:
            if self.strategy == STRATEGY_LOAD:
                if not self._update_alphabet(char):
                    return []
            elif self.strategy == STRATEGY_IGNORE:
                return [char]
            elif self.strategy == STRATEGY_REMOVE:
                return []

        # Work on a copy: updating the set stored in self.table would grow the
        # table and make later calls return different results.
        alt_chars = set(self.table.get(char, ()))

        # Add the alternatives of each direct alternative (one extra level).
        for alt_char in tuple(alt_chars):
            alt_chars.update(self.table.get(alt_char, ()))

        # The character is always a variant of itself.
        alt_chars.add(char)

        # uniq, sort and return
        return self.uniq_and_sort(alt_chars)

    def _get_combinations(self, text, ascii=False):
        """Yield every string built by picking one variant per character.

        :param text: input string
        :param ascii: keep only variants whose code point is in
            ``self.ascii_range``
        """
        variations = []
        for char in text:
            alt_chars = self._get_char_variants(char)

            if ascii:
                alt_chars = [
                    candidate
                    for candidate in alt_chars
                    if ord(candidate) in self.ascii_range
                ]
                # A character with no ASCII variant aborts the whole
                # generation when the ASCII strategy is STRATEGY_IGNORE.
                if not alt_chars and self.ascii_strategy == STRATEGY_IGNORE:
                    return

            # Characters left without variants are skipped.
            if alt_chars:
                variations.append(alt_chars)

        if variations:
            for variant in product(*variations):
                yield "".join(variant)

    def get_combinations(self, text):
        """Return the list of all homoglyph variants of ``text``."""
        return list(self._get_combinations(text))

    def _to_ascii(self, text):
        """Yield the variants of ``text`` made only of ``self.ascii_range`` chars."""
        for variant in self._get_combinations(text, ascii=True):
            if max(map(ord, variant)) in self.ascii_range:
                yield variant

    def to_ascii(self, text):
        """Return the unique ASCII variants of ``text``, longest first."""
        return self.uniq_and_sort(self._to_ascii(text))