Spaces:

edugp
/

perplexity-lenses

Runtime error

App Files Files Community

edugp committed on Nov 11, 2021

Commit

0def03f

1 Parent(s): ab846df

Replicate default cc_net preprocessing at inference time on KenlmModel.get_perplexity

Browse files

Files changed (1) hide show

perplexity_lenses/perplexity.py +88 -1

perplexity_lenses/perplexity.py CHANGED Viewed

@@ -1,10 +1,53 @@
 import os
 import urllib.request
 import kenlm
 class KenlmModel:
     def __init__(self, language):
         download_kenlm_model(language)
         try:
@@ -19,7 +62,9 @@ class KenlmModel:
     def from_pretrained(cls, language: str):
         return cls(language)
-    def get_perplexity(self, doc: str):
         doc_log_score, doc_length = 0, 0
         for line in doc.split("\n"):
             log_score = self.model.score(line)
@@ -28,6 +73,48 @@ class KenlmModel:
             doc_length += length
         return 10.0 ** (-doc_log_score / doc_length)
 def download_kenlm_model(language: str):
     root_url = "http://dl.fbaipublicfiles.com/cc_net/lm"

 import os
+import re
+import unicodedata
 import urllib.request
+from typing import Dict
 import kenlm
 class KenlmModel:
+    digit_re: re.Pattern = re.compile(r"\d")  # matches one digit; normalize() substitutes "0" for every match
+    unicode_punct: Dict[str, str] = {  # punctuation character -> ASCII replacement string
+        "，": ",",
+        "。": ".",
+        "、": ",",
+        "„": '"',
+        "”": '"',
+        "“": '"',
+        "«": '"',
+        "»": '"',
+        "１": '"',  # NOTE(review): key looks like a full-width digit one mapped to a quote; presumably mirrors the upstream cc_net table -- confirm
+        "」": '"',
+        "「": '"',
+        "《": '"',
+        "》": '"',
+        "´": "'",
+        "∶": ":",
+        "：": ":",
+        "？": "?",
+        "！": "!",
+        "（": "(",
+        "）": ")",
+        "；": ";",
+        "–": "-",
+        "—": " - ",  # some replacements are longer than one character
+        "．": ". ",
+        "～": "~",
+        "’": "'",
+        "…": "...",
+        "━": "-",
+        "〈": "<",
+        "〉": ">",
+        "【": "[",
+        "】": "]",
+        "％": "%",
+        "►": "-",
+    }
+    unicode_punct_re = re.compile(f"[{''.join(unicode_punct.keys())}]")  # character class matching any single key of unicode_punct
+    non_printing_chars_re = re.compile(f"[{''.join(map(chr, list(range(0,32)) + list(range(127,160))))}]")  # code points 0-31 and 127-159; this range includes "\n" and "\t"
     def __init__(self, language):
         download_kenlm_model(language)
         try:
     def from_pretrained(cls, language: str):
         return cls(language)
+    def get_perplexity(self, doc: str, normalize_cc_net: bool = True):
+        if normalize_cc_net:
+            doc = self.normalize(doc)
         doc_log_score, doc_length = 0, 0
         for line in doc.split("\n"):
             log_score = self.model.score(line)
             doc_length += length
         return 10.0 ** (-doc_log_score / doc_length)
+    def normalize(
+        self,
+        line: str,
+        accent: bool = True,
+        case: bool = True,
+        numbers: bool = True,
+        punct: int = 1,
+    ) -> str:
+        line = line.strip()
+        if not line:
+            return line
+        if case:
+            line = line.lower()
+        if accent:
+            line = self.strip_accents(line)
+        if numbers:
+            line = self.digit_re.sub("0", line)
+        if punct == 1:
+            line = self.replace_unicode_punct(line)
+        elif punct == 2:
+            line = self.remove_unicode_punct(line)
+        line = self.remove_non_printing_char(line)
+        return line
+    def strip_accents(self, line: str) -> str:
+        """Strips accents from a piece of text."""
+        nfd = unicodedata.normalize("NFD", line)
+        output = [c for c in nfd if unicodedata.category(c) != "Mn"]
+        if len(output) == line:
+            return line
+        return "".join(output)
+    def replace_unicode_punct(self, text: str) -> str:
+        return "".join((self.unicode_punct.get(c, c) for c in text))
+    def remove_unicode_punct(self, text: str) -> str:
+        """More aggressive version of replace_unicode_punct but also faster."""
+        return self.unicode_punct_re.sub("", text)
+    def remove_non_printing_char(self, text: str) -> str:
+        return self.non_printing_chars_re.sub("", text)
 def download_kenlm_model(language: str):
     root_url = "http://dl.fbaipublicfiles.com/cc_net/lm"