File size: 2,702 Bytes
0837e5d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
# Modified from OpenAI's Whisper english_normalizer.

import re
import unicodedata
from typing import Iterable

# Non-ASCII letters that "NFKD" normalization does not split into a base
# letter plus combining mark, so their ASCII replacement is listed by hand.
# One lowercase/uppercase pair per row.
ADDITIONAL_DIACRITICS = {
    "œ": "oe", "Œ": "OE",
    "ø": "o", "Ø": "O",
    "æ": "ae", "Æ": "AE",
    "ß": "ss", "ẞ": "SS",
    "đ": "d", "Đ": "D",
    "ð": "d", "Ð": "D",
    "þ": "th", "Þ": "th",
    "ł": "l", "Ł": "L",
}

# Accented lowercase letters, grouped by accent type.
PORTUGUESE_ACCENTED_CHARACTERS = [
    "ç",  # cedilla
    "á", "é", "í", "ó", "ú",  # acute
    "â", "ê", "ô",  # circumflex
    "ã", "õ",  # tilde
    "à", "ò", "è", "ì", "ù",  # grave
]

# Combining marks that PortugueseTextNormalizer passes as `keep` to
# remove_symbols_and_diacritics, so these accents survive the clean-up.
# Written as escapes because bare combining characters are hard to read
# and easy to corrupt inside quotes.
PORTUGUESE_DIACRITICS = [
    "\u0327",  # COMBINING CEDILLA
    "\u0302",  # COMBINING CIRCUMFLEX ACCENT
    "\u0300",  # COMBINING GRAVE ACCENT
    "\u0303",  # COMBINING TILDE
    "\u0301",  # COMBINING ACUTE ACCENT
]


def remove_symbols_and_diacritics(s: str, keep: Iterable[str] = "") -> str:
    """
    Replace markers, symbols, and punctuation with a space, and drop
    diacritics (category 'Mn' plus the manual ADDITIONAL_DIACRITICS mappings).

    The text is decomposed with "NFKD" first and processed one code point at
    a time, so the returned string is in decomposed form: any combining mark
    listed in `keep` is emitted as its own code point after its base letter.

    Args:
        s: Text to clean.
        keep: Single characters (as they appear after NFKD decomposition)
            that are copied to the output unchanged. May be any iterable,
            including a one-shot iterator.

    Returns:
        The cleaned text.
    """
    # Materialize once: `keep` may be a one-shot iterator, which repeated
    # `in` tests would exhaust after the first few characters.
    kept = frozenset(keep)

    pieces = []
    for c in unicodedata.normalize("NFKD", s):
        if c in kept:
            pieces.append(c)
        elif c in ADDITIONAL_DIACRITICS:
            pieces.append(ADDITIONAL_DIACRITICS[c])
        else:
            category = unicodedata.category(c)
            if category == "Mn":
                # Non-spacing mark that was not asked to be kept: drop it.
                continue
            # Other marks (M*), symbols (S*) and punctuation (P*) become a
            # space; everything else passes through untouched.
            pieces.append(" " if category[0] in "MSP" else c)
    return "".join(pieces)


class PortugueseTextNormalizer:
    """
    Callable that normalizes Portuguese text.

    It lowercases the input, removes bracketed and parenthesized spans and a
    few filler interjections, expands abbreviated titles, strips punctuation
    and symbols, and collapses whitespace, while keeping the accents listed
    in PORTUGUESE_DIACRITICS.
    """

    def __init__(self):
        # Filler sounds that are deleted when they occur as whole words.
        self.ignore_patterns = r"\b(hmm|mm|mhm|mmm|uh)\b"
        # Whole-word regex -> expansion. Surplus whitespace left by the
        # replacements is collapsed at the end of __call__.
        self.replacers = {
            # contractions in titles/prefixes
            r"\bsr\b": "senhor ",
            r"\bsra\b": "senhora ",
            r"\bsto\b": "santo ",
            r"\bsta\b": "santa ",
            r"\bdr\b": "doutor ",
            r"\bdra\b": "doutora ",
            r"\bprof\b": "professor ",
            r"\bcap\b": "capitão ",
        }

    def __call__(self, s: str) -> str:
        """
        Return the normalized form of `s`.

        The result is lowercase and NFC-composed, with every run of
        whitespace reduced to a single space. It is not stripped, so one
        leading or trailing space may remain.
        """
        s = s.lower()

        s = re.sub(r"[<\[][^>\]]*[>\]]", "", s)  # remove words between brackets
        s = re.sub(r"\(([^)]+?)\)", "", s)  # remove words between parenthesis
        s = re.sub(self.ignore_patterns, "", s)

        for pattern, replacement in self.replacers.items():
            s = re.sub(pattern, replacement, s)

        # In English, one would remove commas between digits (thousands
        # separators) and keep periods followed by digits (decimals). In
        # Portuguese either a comma or a period can be the decimal separator,
        # so the two cannot be told apart; both are removed when they sit
        # between digits. Lookarounds leave the digits unconsumed, so every
        # separator in runs such as "1.2.3" is matched, not just the first.
        s = re.sub(r"(?<=\d)[.,](?=\d)", "", s)

        s = remove_symbols_and_diacritics(s, keep=PORTUGUESE_DIACRITICS)

        # remove_symbols_and_diacritics returns NFKD-decomposed text; recompose
        # so kept accents form single code points again (e.g. "a" + U+0303
        # becomes "ã") and the output compares equal to ordinary composed text.
        s = unicodedata.normalize("NFC", s)

        s = re.sub(r"\s+", " ", s)  # replace any successive whitespace characters with a space

        return s.lower()