laurievb
/

OpenLID-v2

Text Classification

fastText

language-identification

Model card Files Files and versions Community

laurievb committed 9 days ago

Commit

9c82414

•

1 Parent(s): 1aa019d

Upload scripts/openlid.py with huggingface_hub

Browse files

Files changed (1) hide show

scripts/openlid.py +87 -0

scripts/openlid.py ADDED Viewed

	@@ -0,0 +1,87 @@

+import unicodedata
+import emoji
+import sys
class Demojizer:
    """
    Replace emoji in a string, using a character trie built from ``emoji``.

    based on:
    https://github.com/carpedm20/emoji/blob/d8bbfe455c6fcd12b96ed1dce6e0978fe7a47431/emoji/core.py#L141
    """

    def _get_search_tree(self):
        """Build a nested-dict trie over every key of ``EMOJI_DATA``.

        Each level is keyed by one character of an emoji sequence. The node
        reached after the last character of a sequence additionally stores
        that sequence's ``EMOJI_DATA`` entry under the key ``"data"``.
        """
        search_tree = {}
        for emj, emj_data in emoji.unicode_codes.EMOJI_DATA.items():
            if not emj:
                # An empty key has no terminal node to mark.
                continue
            node = search_tree
            for char in emj:
                node = node.setdefault(char, {})
            node["data"] = emj_data
        return search_tree

    def __init__(self) -> None:
        self.search_tree = self._get_search_tree()

    def _longest_match_end(self, string: str, start: int) -> int:
        """Return the end index (exclusive) of the longest emoji at ``start``.

        ``string[start]`` must be a key of ``self.search_tree``. Returns -1
        when no complete emoji sequence begins at ``start``.
        """
        length = len(string)
        node = self.search_tree[string[start]]
        pos = start + 1
        match_end = pos if "data" in node else -1
        while pos < length and string[pos] in node:
            node = node[string[pos]]
            pos += 1
            if "data" in node:
                # Remember the longest complete sequence seen so far, so a
                # dead end deeper in the trie does not discard this match.
                match_end = pos
        return match_end

    def __call__(self, string: str, replace_str: str) -> str:
        """Return ``string`` with every emoji replaced by ``replace_str``.

        Whitespace directly following a replaced emoji is dropped, and the
        variation selectors U+FE0E / U+FE0F are removed wherever they are
        not consumed as part of a matched emoji.

        Args:
            string: text to process.
            replace_str: text substituted for each matched emoji sequence.
        """
        result = []
        i = 0
        length = len(string)
        # True while we are directly behind a replaced emoji (possibly with
        # already-dropped whitespace in between).
        after_emoji = False
        while i < length:
            char = string[i]
            starts_emoji = char in self.search_tree
            if starts_emoji:
                match_end = self._longest_match_end(string, i)
                if match_end != -1:
                    result.append(replace_str)
                    after_emoji = True
                    i = match_end
                    continue
            elif after_emoji and char.isspace():
                # Swallow whitespace that trails a replaced emoji.
                i += 1
                continue
            after_emoji = False
            if char != "\ufe0e" and char != "\ufe0f":
                result.append(char)
            i += 1
        return "".join(result)
+def _get_replacer(replace_by: str = " ") -> str:
+    non_printable_map = {
+        ord(c): replace_by
+        for c in (chr(i) for i in range(sys.maxunicode + 1))
+        # same as \p{C} in perl
+        # see https://www.unicode.org/reports/tr44/#General_Category_Values
+        if unicodedata.category(c) in {"C", "Cc", "Cf", "Cs", "Co", "Cn"}
+    }
+    def replace_non_printing_char(line) -> str:
+        return line.translate(non_printable_map)
+    return replace_non_printing_char
# (replace_nonprint, demoji) pair shared by all clean_text calls; built on
# first use because constructing either helper is far more expensive than
# cleaning a single string.
_CLEAN_TEXT_HELPERS = None


def clean_text(input_text: str) -> str:
    """cleans input text prior to LID

    Steps, in order: replace non-printing characters with a space, apply
    NFKC normalisation, then remove emoji.

    Args:
        input_text: raw text to clean.

    Returns:
        The cleaned text.
    """
    global _CLEAN_TEXT_HELPERS
    if _CLEAN_TEXT_HELPERS is None:
        _CLEAN_TEXT_HELPERS = (_get_replacer(" "), Demojizer())
    replace_nonprint, demoji = _CLEAN_TEXT_HELPERS

    clean = replace_nonprint(input_text)
    clean = unicodedata.normalize("NFKC", clean)
    clean = demoji(clean, "")
    return clean