Commit fb27ada
Parent(s): c179430

Delete model.py
model.py (DELETED)
@@ -1,163 +0,0 @@
import os
import re
import unicodedata
from typing import Dict

import kenlm
import sentencepiece


class SentencePiece:
    def __init__(
        self,
        model: str,
    ):
        super().__init__()
        self.sp = sentencepiece.SentencePieceProcessor()
        self.sp.load(str(model))

    def do(self, text: str) -> str:
        # Split into SentencePiece pieces and rejoin with spaces, giving
        # the whitespace-separated tokens KenLM expects.
        tokenized = self.sp.encode_as_pieces(text)
        return " ".join(tokenized)


class KenlmModel:
    digit_re: re.Pattern = re.compile(r"\d")
    # cc_net-style mapping of full-width/typographic punctuation to ASCII.
    unicode_punct: Dict[str, str] = {
        "，": ",",
        "。": ".",
        "、": ",",
        "„": '"',
        "”": '"',
        "“": '"',
        "«": '"',
        "»": '"',
        "１": '"',
        "」": '"',
        "「": '"',
        "《": '"',
        "》": '"',
        "´": "'",
        "∶": ":",
        "：": ":",
        "？": "?",
        "！": "!",
        "（": "(",
        "）": ")",
        "；": ";",
        "–": "-",
        "—": " - ",
        "．": ". ",
        "～": "~",
        "’": "'",
        "…": "...",
        "━": "-",
        "〈": "<",
        "〉": ">",
        "【": "[",
        "】": "]",
        "％": "%",
        "►": "-",
    }
    unicode_punct_re = re.compile(f"[{''.join(unicode_punct.keys())}]")
    non_printing_chars_re = re.compile(
        f"[{''.join(map(chr, list(range(0, 32)) + list(range(127, 160))))}]"
    )
    kenlm_model_dir = None
    sentence_piece_model_dir = None

    def __init__(
        self,
        model_dataset: str,
        language: str,
        lower_case: bool = False,
        remove_accents: bool = False,
        normalize_numbers: bool = True,
        punctuation: int = 1,
    ):
        self.model = kenlm.Model(os.path.join(model_dataset, f"{language}.arpa.bin"))
        self.tokenizer = SentencePiece(os.path.join(model_dataset, f"{language}.sp.model"))
        self.accent = remove_accents
        self.case = lower_case
        self.numbers = normalize_numbers
        self.punct = punctuation

    @classmethod
    def from_pretrained(
        cls,
        model_dataset: str,
        language: str,
    ):
        return cls(
            model_dataset,
            language,
            False,
            False,
            True,
            1,
        )

    def pp(self, log_score, length):
        # Perplexity from a total log10 probability and a token count:
        # pp = 10 ** (-log_score / length)
        return 10.0 ** (-log_score / length)

    def get_perplexity(self, doc: str, normalize_cc_net: bool = True):
        if normalize_cc_net:
            doc = self.normalize(
                doc,
                accent=self.accent,
                case=self.case,
                numbers=self.numbers,
                punct=self.punct,
            )
        # Tokenize (after normalizing): see https://github.com/facebookresearch/cc_net/blob/bda555bd1cf1ee2e0b925363e62a61cd46c8b60d/cc_net/mine.py#L352 for the full pipeline
        doc = self.tokenizer.do(doc)
        doc_log_score, doc_length = 0, 0
        for line in doc.split("\n"):
            log_score = self.model.score(line)
            length = len(line.split()) + 1  # +1 for the end-of-sentence token
            doc_log_score += log_score
            doc_length += length
        return round(self.pp(doc_log_score, doc_length), 1)

    def normalize(
        self,
        line: str,
        accent: bool = True,
        case: bool = True,
        numbers: bool = True,
        punct: int = 1,
    ) -> str:
        line = line.strip()
        if not line:
            return line
        if case:
            line = line.lower()
        if accent:
            line = self.strip_accents(line)
        if numbers:
            line = self.digit_re.sub("0", line)
        if punct == 1:
            line = self.replace_unicode_punct(line)
        elif punct == 2:
            line = self.remove_unicode_punct(line)
        line = self.remove_non_printing_char(line)
        return line

    def strip_accents(self, line: str) -> str:
        """Strips accents from a piece of text."""
        nfd = unicodedata.normalize("NFD", line)
        output = [c for c in nfd if unicodedata.category(c) != "Mn"]
        if len(output) == len(line):
            return line
        return "".join(output)

    def replace_unicode_punct(self, text: str) -> str:
        return "".join(self.unicode_punct.get(c, c) for c in text)

    def remove_unicode_punct(self, text: str) -> str:
        """More aggressive version of replace_unicode_punct but also faster."""
        return self.unicode_punct_re.sub("", text)

    def remove_non_printing_char(self, text: str) -> str:
        return self.non_printing_chars_re.sub("", text)