Spaces:

yaful
/

DeepfakeTextDetect

Runtime error

App Files Files Community

yaful committed on Jun 26, 2023

Commit

f507f90

1 Parent(s): 568da45

Create utils.py

Browse files

Files changed (1) hide show

utils.py +279 -0

utils.py ADDED Viewed

	@@ -0,0 +1,279 @@

+import re
+import torch
+from cleantext import clean
+from itertools import chain
class MosesPunctNormalizer:
    """
    This is a Python port of the Moses punctuation normalizer from
    https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/normalize-punctuation.perl

    Each class-level table is a list of ``(regex, replacement)`` pairs.
    ``__init__`` selects and flattens the tables that apply to the requested
    configuration into ``self.substitutions``; ``normalize`` then applies
    them one after another with ``re.sub``. The ``# lines N - M`` remarks
    next to the tables refer to line numbers in the Perl script.
    """

    EXTRA_WHITESPACE = [  # lines 21 - 30
        (r"\r", r""),
        (r"\(", r" ("),
        (r"\)", r") "),
        (r" +", r" "),
        (r"\) ([.!:?;,])", r")\g<1>"),
        (r"\( ", r"("),
        (r" \)", r")"),
        (r"(\d) %", r"\g<1>%"),
        (r" :", r":"),
        (r" ;", r";"),
    ]
    NORMALIZE_UNICODE_IF_NOT_PENN = [(r"`", r"'"), (r"''", r' " ')]  # lines 33 - 34
    NORMALIZE_UNICODE = [  # lines 37 - 50
        ("„", r'"'),
        ("“", r'"'),
        ("”", r'"'),
        ("–", r"-"),
        ("—", r" - "),
        (r" +", r" "),
        ("´", r"'"),
        ("([a-zA-Z])‘([a-zA-Z])", r"\g<1>'\g<2>"),
        ("([a-zA-Z])’([a-zA-Z])", r"\g<1>'\g<2>"),
        ("‘", r"'"),
        ("‚", r"'"),
        ("’", r"'"),
        (r"''", r'"'),
        ("´´", r'"'),
        ("…", r"..."),
    ]
    FRENCH_QUOTES = [  # lines 52 - 57
        ("\u00A0«\u00A0", r'"'),
        ("«\u00A0", r'"'),
        ("«", r'"'),
        ("\u00A0»\u00A0", r'"'),
        ("\u00A0»", r'"'),
        ("»", r'"'),
    ]
    HANDLE_PSEUDO_SPACES = [  # lines 59 - 67
        ("\u00A0%", r"%"),
        ("nº\u00A0", "nº "),
        ("\u00A0:", r":"),
        ("\u00A0ºC", " ºC"),
        ("\u00A0cm", r" cm"),
        ("\u00A0\\?", "?"),
        ("\u00A0\\!", "!"),
        ("\u00A0;", r";"),
        (",\u00A0", r", "),
        (r" +", r" "),
    ]
    EN_QUOTATION_FOLLOWED_BY_COMMA = [(r'"([,.]+)', r'\g<1>"')]
    DE_ES_FR_QUOTATION_FOLLOWED_BY_COMMA = [
        (r',"', r'",'),
        (r'(\.+)"(\s*[^<])', r'"\g<1>\g<2>'),  # don't fix period at end of sentence
    ]
    # Digit, no-break space, digit: joined with "," for these languages ...
    DE_ES_CZ_CS_FR = [
        ("(\\d)\u00A0(\\d)", r"\g<1>,\g<2>"),
    ]
    # ... and with "." for every other language.
    OTHER = [
        ("(\\d)\u00A0(\\d)", r"\g<1>.\g<2>"),
    ]
    # Regex substitutions from replace-unicode-punctuation.perl
    # https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/replace-unicode-punctuation.perl
    REPLACE_UNICODE_PUNCTUATION = [
        ("，", ","),
        (r"。\s*", ". "),
        ("、", ","),
        ("”", '"'),
        ("“", '"'),
        ("∶", ":"),
        ("：", ":"),
        ("？", "?"),
        ("《", '"'),
        ("》", '"'),
        ("）", ")"),
        ("！", "!"),
        ("（", "("),
        ("；", ";"),
        ("」", '"'),
        ("「", '"'),
        ("０", "0"),
        ("１", "1"),
        ("２", "2"),
        ("３", "3"),
        ("４", "4"),
        ("５", "5"),
        ("６", "6"),
        ("７", "7"),
        ("８", "8"),
        ("９", "9"),
        (r"．\s*", ". "),
        ("～", "~"),
        ("’", "'"),
        ("…", "..."),
        ("━", "-"),
        ("〈", "<"),
        ("〉", ">"),
        ("【", "["),
        ("】", "]"),
        ("％", "%"),
    ]

    def __init__(
        self,
        lang="en",
        penn=True,
        norm_quote_commas=True,
        norm_numbers=True,
        pre_replace_unicode_punct=False,
        post_remove_control_chars=False,
    ):
        """
        :param lang: The two-letter language code.
        :type lang: str
        :param penn: Normalize Penn Treebank style quotations.
        :type penn: bool
        :param norm_quote_commas: Normalize quotations and commas
        :type norm_quote_commas: bool
        :param norm_numbers: Normalize numbers
        :type norm_numbers: bool
        :param pre_replace_unicode_punct: Apply ``replace_unicode_punct``
            before the normalization rules.
        :type pre_replace_unicode_punct: bool
        :param post_remove_control_chars: Apply ``remove_control_chars``
            after the normalization rules.
        :type post_remove_control_chars: bool
        """
        rule_groups = [
            self.EXTRA_WHITESPACE,
            self.NORMALIZE_UNICODE,
            self.FRENCH_QUOTES,
            self.HANDLE_PSEUDO_SPACES,
        ]
        # NOTE(review): the table name says "if NOT penn", yet it is added
        # when ``penn`` is truthy. Kept as is so existing output does not
        # change; confirm the intended condition.
        if penn:  # Adds the penn substitutions after extra_whitespace regexes.
            rule_groups.insert(1, self.NORMALIZE_UNICODE_IF_NOT_PENN)
        if norm_quote_commas:
            if lang == "en":
                rule_groups.append(self.EN_QUOTATION_FOLLOWED_BY_COMMA)
            elif lang in ["de", "es", "fr"]:
                rule_groups.append(self.DE_ES_FR_QUOTATION_FOLLOWED_BY_COMMA)
        if norm_numbers:
            if lang in ["de", "es", "cz", "cs", "fr"]:
                rule_groups.append(self.DE_ES_CZ_CS_FR)
            else:
                rule_groups.append(self.OTHER)
        # Flat, ordered list of (regex, replacement) pairs used by normalize().
        self.substitutions = list(chain.from_iterable(rule_groups))
        self.pre_replace_unicode_punct = pre_replace_unicode_punct
        self.post_remove_control_chars = post_remove_control_chars

    def normalize(self, text):
        """
        Returns a string with normalized punctuation.

        :param text: Input text; it is converted with ``str()`` first.
        :return: The normalized text with surrounding whitespace stripped.
        :rtype: str
        """
        text = str(text)
        # Optionally, replace unicode puncts BEFORE normalization.
        if self.pre_replace_unicode_punct:
            text = self.replace_unicode_punct(text)
        # Actual normalization: rules are order-dependent, apply sequentially.
        for regexp, substitution in self.substitutions:
            text = re.sub(regexp, substitution, text)
        # Optionally, remove control characters AFTER normalization.
        if self.post_remove_control_chars:
            text = self.remove_control_chars(text)
        return text.strip()

    def replace_unicode_punct(self, text):
        """
        Applies every ``REPLACE_UNICODE_PUNCTUATION`` rule, in order.

        :param text: Input text; it is converted with ``str()`` first.
        :return: The text after all replacements.
        :rtype: str
        """
        text = str(text)
        for regexp, substitution in self.REPLACE_UNICODE_PUNCTUATION:
            text = re.sub(regexp, substitution, text)
        return text

    def remove_control_chars(self, text):
        """
        Removes every character whose Unicode general category starts with
        "C" (the ``\\p{C}`` class: Cc, Cf, Cs, Co, Cn).

        Implemented with the standard library ``unicodedata`` module because
        the third-party ``regex`` module is not imported by this file.

        :param text: Input text; it is converted with ``str()`` first.
        :return: The text without those characters.
        :rtype: str
        """
        import unicodedata

        return "".join(
            char for char in str(text)
            if not unicodedata.category(char).startswith("C")
        )
+def _tokenization_norm(text):
+    text = text.replace(
+        ' ,', ',').replace(
+        ' .', '.').replace(
+        ' ?', '?').replace(
+        ' !', '!').replace(
+        ' ;', ';').replace(
+        ' \'', '\'').replace(
+        ' ’ ', '\'').replace(
+        ' :', ':').replace(
+        '<newline>', '\n').replace(
+        '`` ', '"').replace(
+        ' \'\'', '"').replace(
+        '\'\'', '"').replace(
+        '.. ', '... ').replace(
+        ' )', ')').replace(
+        '( ', '(').replace(
+        ' n\'t', 'n\'t').replace(
+        ' i ', ' I ').replace(
+        ' i\'', ' I\'').replace(
+        '\\\'', '\'').replace(
+        '\n ', '\n').strip()
+    return text
def _clean_text(text):
    """
    Clean a piece of text in a fixed sequence of steps:

    1. delete special tokens of pretrained language models,
    2. normalize punctuation with ``MosesPunctNormalizer``,
    3. undo tokenization artifacts with ``_tokenization_norm``,
    4. run ``clean`` to strip URLs, e-mail addresses and phone numbers,
    5. drop every character outside a small whitelist,
    6. drop runs of two or more bracket/quote/dash characters,
    7. collapse all whitespace runs into single spaces.
    """
    # Step 1: remove PLM special tokens.
    special_tokens = r'(\<pad\>)|(\<s\>)|(\<\/s\>)|(\<unk\>)|(\<\|endoftext\|\>)'
    stripped = re.sub(special_tokens, "", text)

    # Steps 2 and 3: punctuation, then tokenization normalization.
    normalizer = MosesPunctNormalizer()
    detokenized = _tokenization_norm(normalizer.normalize(stripped))

    # Step 4: remove specific text patterns, e.g. URL, e-mail, phone number.
    clean_options = dict(
        fix_unicode=True,                # fix various unicode errors
        to_ascii=True,                   # transliterate to closest ASCII representation
        lower=False,                     # do not lowercase the text
        no_line_breaks=True,             # fully strip line breaks
        no_urls=True,                    # replace all URLs
        no_emails=True,                  # replace all email addresses
        no_phone_numbers=True,           # replace all phone numbers
        no_numbers=False,                # keep numbers
        no_digits=False,                 # keep digits
        no_currency_symbols=False,       # keep currency symbols
        no_punct=False,                  # keep punctuation
        replace_with_punct="",
        replace_with_url="",             # URLs are replaced by nothing
        replace_with_email="",           # e-mail addresses are replaced by nothing
        replace_with_phone_number="",    # phone numbers are replaced by nothing
        replace_with_number="<NUMBER>",
        replace_with_digit="<DIGIT>",
        replace_with_currency_symbol="<CUR>",
        lang="en",                       # set to 'de' for German special handling
    )
    cleaned = clean(detokenized, **clean_options)

    # Step 5: keep common punctuation, ASCII letters, digits and spaces only.
    disallowed_chars = r'[^ A-Za-z0-9.?!,:;\-\[\]\{\}\(\)\'\"]'
    cleaned = re.sub(disallowed_chars, '', cleaned)

    # Step 6: remove runs (length >= 2) of dashes, brackets and quotes.
    repeated_marks = r'[-\[\]\{\}\(\)\'\"]{2,}'
    cleaned = re.sub(repeated_marks, '', cleaned)

    # Step 7: remove redundant spaces.
    return " ".join(cleaned.split())
+def _rm_line_break(text):
+    text = text.replace("\n","\\n")
+    text = re.sub(r'(?:\\n)*\\n', r'\\n', text)
+    text = re.sub(r'^.{0,3}\\n', '', text)
+    text = text.replace("\\n"," ")
+    return text
def preprocess(text):
    """
    Run the full pipeline on ``text``: flatten line breaks with
    ``_rm_line_break``, then clean the result with ``_clean_text``.
    """
    flattened = _rm_line_break(text)
    return _clean_text(flattened)