"""Replace emoji in text with their textual names from a bundled JSON table.

The table is read from ``emojis_tr_twitter.json`` located next to this module
and is indexed as a character trie so that multi-code-point emoji can be
matched in a single left-to-right scan.
"""

import json
import os

# NOTE: ``dir`` shadows the builtin of the same name; the name is retained so
# that existing code reading this module attribute keeps working.
dir = os.path.dirname(__file__)
EMOJI_DATA_PATH = os.path.join(dir, "emojis_tr_twitter.json")

# Decode explicitly as UTF-8: the platform default codec (e.g. cp1252 on
# Windows) cannot be relied on for a file that contains emoji.
with open(EMOJI_DATA_PATH, "r", encoding="utf-8") as f:
    emojis = json.load(f)

# Lazily built character trie over the keys of ``emojis``.
_SEARCH_TREE = None


def _get_search_tree():
    """Return the cached character trie built from the keys of ``emojis``.

    Every key is inserted one character at a time; the node reached after
    the last character stores the key's value under the ``"data"`` entry.
    The trie is built on the first call and reused afterwards.
    """
    global _SEARCH_TREE
    if _SEARCH_TREE is None:
        root = {}
        for emj, data in emojis.items():
            if not emj:
                # An empty key has no character path to hang data on.
                continue
            node = root
            for char in emj:
                node = node.setdefault(char, {})
            node["data"] = data
        # Publish only the finished trie so that a failure while building
        # never leaves a partial tree cached for later calls.
        _SEARCH_TREE = root
    return _SEARCH_TREE


def _match_longest_emoji(tree, string, start):
    """Find the longest emoji in ``string`` that begins at index ``start``.

    Returns ``(end, data)`` where ``string[start:end]`` is the longest prefix
    whose trie node carries ``"data"``, or ``None`` when no emoji starts at
    ``start``.
    """
    node = tree.get(string[start])
    if node is None:
        return None

    length = len(string)
    best = None
    j = start + 1
    while True:
        if "data" in node:
            # Remember the most recent complete emoji; a deeper walk may end
            # on a node that is only a prefix of some longer sequence.
            best = (j, node["data"])
        if j == length or string[j] not in node:
            return best
        node = node[string[j]]
        j += 1


def demojize(
    string,
    delimiters=(" ", " "),
    language="tr",
    version=None,
    handle_version=None,
):
    """Return ``string`` with every known emoji replaced by its text name.

    :param string: Text to process.
    :param delimiters: Pair of strings placed before and after each name.
    :param language: Key looked up in each emoji's data entry. The special
        value ``"alias"`` selects the ``"tr"`` entry but prefers the first
        item of the emoji's ``"alias"`` list when one exists.
    :param version: When not ``None``, emoji whose ``"E"`` value is greater
        than this are handed to ``handle_version`` instead of being named.
    :param handle_version: Used only for emoji filtered out by ``version``.
        A callable is invoked as ``handle_version(emoji, data)`` where
        ``data`` is a copy of the emoji's entry extended with
        ``"match_start"`` and ``"match_end"``; any other non-``None`` value
        is converted with ``str``. Falsy results (including the default
        ``None``) remove the emoji from the output.
    :return: The converted text. Emoji that have no entry for ``language``
        are kept as they are. The variation selectors U+FE0E and U+FE0F are
        dropped when they are not part of a matched emoji.
    """
    use_aliases = language == "alias"
    if use_aliases:
        language = "tr"

    tree = _get_search_tree()
    variation_selectors = ("\ufe0e", "\ufe0f")
    result = []
    length = len(string)
    i = 0
    while i < length:
        match = _match_longest_emoji(tree, string, i)
        if match is None:
            char = string[i]
            if char not in variation_selectors:
                result.append(char)
            i += 1
            continue

        end, emj_data = match
        code_points = string[i:end]

        if version is not None and emj_data["E"] > version:
            # presumably "E" is the emoji's introduction version -- confirm
            # against the data file.
            if callable(handle_version):
                emj_data = emj_data.copy()
                emj_data["match_start"] = i
                emj_data["match_end"] = end
                replace_str = handle_version(code_points, emj_data)
            elif handle_version is not None:
                replace_str = str(handle_version)
            else:
                replace_str = None
        elif language in emj_data:
            if use_aliases and "alias" in emj_data:
                # NOTE(review): only the trailing character is stripped here,
                # while the branch below strips both ends -- confirm this
                # matches how aliases are stored in the data file.
                replace_str = (
                    delimiters[0] + emj_data["alias"][0][:-1] + delimiters[1]
                )
            else:
                replace_str = (
                    delimiters[0] + emj_data[language][1:-1] + delimiters[1]
                )
        else:
            # The emoji exists, but it is not translated, so we keep the emoji
            replace_str = code_points

        if replace_str:
            result.append(replace_str)
        # Resume scanning right after the matched emoji.
        i = end

    return "".join(result)