laurievb committed on
Commit
9c82414
1 Parent(s): 1aa019d

Upload scripts/openlid.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. scripts/openlid.py +87 -0
scripts/openlid.py ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import unicodedata
2
+ import emoji
3
+ import sys
4
+
5
class Demojizer:
    """Replace emoji in a string with a caller-supplied placeholder.

    based on:
    https://github.com/carpedm20/emoji/blob/d8bbfe455c6fcd12b96ed1dce6e0978fe7a47431/emoji/core.py#L141

    The emoji listed in ``emoji.unicode_codes.EMOJI_DATA`` are stored in a
    character trie (nested dicts keyed by single characters). A node that
    ends a complete emoji additionally carries a ``"data"`` entry.
    """

    def _get_search_tree(self):
        """Build the character trie from ``emoji.unicode_codes.EMOJI_DATA``.

        Returns:
            Nested dicts mapping each character of an emoji sequence to the
            sub-tree for the following character; the node reached after the
            last character holds that emoji's data under the key ``"data"``.
        """
        search_tree = {}
        for emj, data in emoji.unicode_codes.EMOJI_DATA.items():
            node = search_tree
            for char in emj:
                node = node.setdefault(char, {})
            # "data" is four characters long, so it can never collide with
            # the single-character keys used for the trie edges.
            node["data"] = data
        return search_tree

    def __init__(self) -> None:
        self.search_tree = self._get_search_tree()

    def __call__(self, string: str, replace_str: str):
        """Return ``string`` with every emoji replaced by ``replace_str``.

        Whitespace directly following a replaced emoji is dropped, and the
        variation selectors U+FE0E / U+FE0F are removed wherever they occur.

        Args:
            string: Text to process.
            replace_str: Text inserted in place of each emoji found.

        Returns:
            The processed text.
        """
        result = []
        i = 0
        length = len(string)
        # True while only whitespace has been seen since the last emoji.
        after_emoji = False
        while i < length:
            consumed = False
            char = string[i]
            node = self.search_tree.get(char)
            if node is not None:
                j = i + 1
                # End index (exclusive) of the longest complete emoji that
                # starts at i, or -1 when none has been seen yet.
                match_end = j if "data" in node else -1
                while j < length and string[j] in node:
                    node = node[string[j]]
                    j += 1
                    if "data" in node:
                        match_end = j
                if match_end != -1:
                    # Use the longest *complete* emoji rather than the
                    # deepest trie node reached: the walk may run past a
                    # valid emoji into an unfinished longer sequence (for
                    # example an emoji followed by a dangling joiner).
                    after_emoji = True
                    consumed = True
                    result.append(replace_str)
                    i = match_end - 1
                else:
                    after_emoji = False
            elif after_emoji:
                if char.isspace():
                    consumed = True
                else:
                    after_emoji = False

            if not consumed and char != "\ufe0e" and char != "\ufe0f":
                result.append(char)
            i += 1

        return "".join(result)
59
+
60
+
61
+ def _get_replacer(replace_by: str = " ") -> str:
62
+ non_printable_map = {
63
+ ord(c): replace_by
64
+ for c in (chr(i) for i in range(sys.maxunicode + 1))
65
+ # same as \p{C} in perl
66
+ # see https://www.unicode.org/reports/tr44/#General_Category_Values
67
+ if unicodedata.category(c) in {"C", "Cc", "Cf", "Cs", "Co", "Cn"}
68
+ }
69
+
70
+ def replace_non_printing_char(line) -> str:
71
+ return line.translate(non_printable_map)
72
+
73
+ return replace_non_printing_char
74
+
75
+
76
# Holds the (replace_nonprint, demoji) helper pair once it has been built.
_CLEANERS = []


def clean_text(input_text: str) -> str:
    """Clean input text prior to language identification (LID).

    The steps, in order:
      1. replace every non-printing character with a space,
      2. apply Unicode NFKC normalisation,
      3. remove emoji.

    Args:
        input_text: Raw text.

    Returns:
        The cleaned text.
    """
    if not _CLEANERS:
        # Neither helper depends on the input, so build them on the first
        # call and reuse them afterwards instead of rebuilding per call.
        # A single append keeps the pair consistent for any reader.
        _CLEANERS.append((_get_replacer(" "), Demojizer()))
    replace_nonprint, demoji = _CLEANERS[0]

    clean = replace_nonprint(input_text)
    clean = unicodedata.normalize("NFKC", clean)
    clean = demoji(clean, "")

    return clean
86
+
87
+