Upload scripts/openlid.py with huggingface_hub
Browse files- scripts/openlid.py +87 -0
scripts/openlid.py
ADDED
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import sys
import unicodedata
from typing import Callable

import emoji
|
4 |
+
|
5 |
+
class Demojizer:
    """Replace emoji in a string using a prefix tree of known emoji.

    based on:
    https://github.com/carpedm20/emoji/blob/d8bbfe455c6fcd12b96ed1dce6e0978fe7a47431/emoji/core.py#L141
    """

    def _get_search_tree(self):
        """Build a character trie over every key of ``EMOJI_DATA``.

        Each node is a dict keyed by the next character of an emoji
        sequence. A node that terminates a complete emoji additionally
        stores that emoji's metadata under the ``"data"`` key (a
        multi-character key, so it cannot clash with a single-character
        child key).
        """
        search_tree = {}
        for emj, data in emoji.unicode_codes.EMOJI_DATA.items():
            node = search_tree
            for char in emj:
                node = node.setdefault(char, {})
            node["data"] = data
        return search_tree

    def __init__(self) -> None:
        self.search_tree = self._get_search_tree()

    def __call__(self, string: str, replace_str: str):
        """Return ``string`` with every emoji replaced by ``replace_str``.

        Whitespace that directly follows a replaced emoji is dropped, and
        the variation selectors U+FE0E / U+FE0F are always removed.

        The longest emoji starting at each position is matched. If the
        text continues with characters that begin a longer sequence but
        never complete it, the longest *complete* emoji seen so far is
        still replaced instead of being left in the output.
        """
        result = []
        i = 0
        length = len(string)
        after_emoji = False  # True while skipping whitespace after an emoji
        while i < length:
            consumed = False
            char = string[i]
            if char in self.search_tree:
                node = self.search_tree[char]
                # Exclusive end index of the longest complete emoji found
                # so far; None until some node on the path carries "data".
                match_end = i + 1 if "data" in node else None
                j = i + 1
                while j < length and string[j] in node:
                    node = node[string[j]]
                    j += 1
                    if "data" in node:
                        match_end = j
                if match_end is not None:
                    after_emoji = True
                    consumed = True
                    result.append(replace_str)
                    # Resume right after the match (the loop adds 1 below).
                    i = match_end - 1
                else:
                    after_emoji = False
            elif after_emoji:
                if char.isspace():
                    consumed = True
                else:
                    after_emoji = False

            if not consumed and char not in ("\ufe0e", "\ufe0f"):
                result.append(char)
            i += 1

        return "".join(result)
|
59 |
+
|
60 |
+
|
61 |
+
def _get_replacer(replace_by: str = " ") -> str:
|
62 |
+
non_printable_map = {
|
63 |
+
ord(c): replace_by
|
64 |
+
for c in (chr(i) for i in range(sys.maxunicode + 1))
|
65 |
+
# same as \p{C} in perl
|
66 |
+
# see https://www.unicode.org/reports/tr44/#General_Category_Values
|
67 |
+
if unicodedata.category(c) in {"C", "Cc", "Cf", "Cs", "Co", "Cn"}
|
68 |
+
}
|
69 |
+
|
70 |
+
def replace_non_printing_char(line) -> str:
|
71 |
+
return line.translate(non_printable_map)
|
72 |
+
|
73 |
+
return replace_non_printing_char
|
74 |
+
|
75 |
+
|
76 |
+
# Lazily built (non-printing-char replacer, Demojizer) pair. Both are
# expensive to construct and do not depend on the input, so they are
# created on the first clean_text() call and reused afterwards.
_CLEAN_TEXT_TOOLS = None


def clean_text(input_text: str) -> str:
    """cleans input text prior to LID

    Steps, in order:
      1. replace non-printing characters with a space,
      2. apply NFKC Unicode normalization,
      3. remove emoji.

    Args:
        input_text: Raw text to clean.

    Returns:
        The cleaned text.
    """
    global _CLEAN_TEXT_TOOLS
    if _CLEAN_TEXT_TOOLS is None:
        # Single tuple assignment, so concurrent first calls can at worst
        # build the helpers twice; they never observe a half-filled cache.
        _CLEAN_TEXT_TOOLS = (_get_replacer(" "), Demojizer())
    replace_nonprint, demoji = _CLEAN_TEXT_TOOLS

    clean = replace_nonprint(input_text)
    clean = unicodedata.normalize("NFKC", clean)
    clean = demoji(clean, "")

    return clean
|
86 |
+
|
87 |
+
|