File size: 3,745 Bytes
bcfd9f0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
import argparse
import re

import inflect
from training import DEFAULT_ALPHABET

# Shared inflect engine; clean_text calls its number_to_words() to spell out
# ordinals and numbers.
INFLECT_ENGINE = inflect.engine()
# Numbers written with thousands separators, e.g. "1,000" (also matches any run
# of three or more digits/commas that starts and ends with a digit).
COMMA_NUMBER_RE = re.compile(r"([0-9][0-9\,]+[0-9])")
# Numbers with a decimal point, e.g. "3.14".
DECIMAL_NUMBER_RE = re.compile(r"([0-9]+\.[0-9]+)")
# Any remaining run of digits.
NUMBER_RE = re.compile(r"[0-9]+")
# Digits followed by a real ordinal suffix. The suffixes are an alternation
# (not a character class) so that text such as "4doors" or "7seas" is not
# mistaken for an ordinal; the trailing \b rejects e.g. "5there".
ORDINALS = re.compile(r"([0-9]+(?:st|nd|rd|th))\b")
# One currency symbol followed by an amount, including optional thousands
# separators and decimals, so "$1,000" and "$5.50" are captured whole.
CURRENCY = re.compile(r"([£$€][0-9]+(?:,[0-9]+)*(?:\.[0-9]+)?)")
WHITESPACE_RE = re.compile(r"\s+")
ALLOWED_CHARACTERS_RE = re.compile(r"[^a-z ,.!?'-]+")
# Words appended after an amount in place of its currency symbol.
MONETARY_REPLACEMENT = {"$": " dollars", "£": " pounds", "€": " euros"}
# Lower-case abbreviations (with trailing period) and the words they expand to.
ABBREVIATION_REPLACEMENT = {
    "mr.": "mister",
    "mrs.": "misess",
    "dr.": "doctor",
    "no.": "number",
    "st.": "saint",
    "co.": "company",
    "jr.": "junior",
    "maj.": "major",
    "gen.": "general",
    "drs.": "doctors",
    "rev.": "reverend",
    "lt.": "lieutenant",
    "hon.": "honorable",
    "sgt.": "sergeant",
    "capt.": "captain",
    "esq.": "esquire",
    "ltd.": "limited",
    "col.": "colonel",
    "ft.": "fort",
}


def clean_text(text, symbols=DEFAULT_ALPHABET, remove_invalid_characters=True):
    """
    Cleans text. This includes:
    - Stripping surrounding whitespace and lower-casing
    - Replacing monetary terms (i.e. $5 -> 5 dollars)
    - Converting ordinals to full words (i.e. 1st -> first)
    - Converting numbers to their full word format (i.e. 100 -> one hundred)
    - Replacing abbreviations (i.e. dr. -> doctor)
    - Collapsing runs of whitespace into a single space
    - Removing characters that are not in the symbols list

    Parameters
    ----------
    text : str
        Text to clean
    symbols : list (optional)
        List of valid symbols in text (default is DEFAULT_ALPHABET)
    remove_invalid_characters : bool (optional)
        Whether to remove characters not in symbols list (default is True)

    Returns
    -------
    str
        Cleaned text
    """

    def spell_out(match):
        # Spell the matched number/ordinal out as words.
        return INFLECT_ENGINE.number_to_words(match.group(0))

    def spell_currency(match):
        # Drop the leading currency symbol and append its word instead.
        amount = match.group(0)
        for symbol, unit in MONETARY_REPLACEMENT.items():
            if symbol in amount:
                return amount[1:] + unit
        return amount

    text = text.strip().lower()

    # Each conversion uses re.sub with a callback so that every match is
    # replaced exactly where it occurs. A find-then-str.replace approach
    # corrupts overlapping numbers (e.g. replacing "1" inside "10").
    text = CURRENCY.sub(spell_currency, text)
    text = ORDINALS.sub(spell_out, text)
    # Comma-separated numbers go before decimals, and both before plain digit
    # runs, so the longer forms are consumed whole.
    text = COMMA_NUMBER_RE.sub(spell_out, text)
    text = DECIMAL_NUMBER_RE.sub(spell_out, text)
    text = NUMBER_RE.sub(spell_out, text)

    # Replace abbreviations. An abbreviation must start the text or follow
    # whitespace, and must be followed by whitespace; lookarounds keep the
    # surrounding whitespace out of the match so adjacent abbreviations and an
    # abbreviation at the very start of the text are both expanded.
    abbreviation_re = re.compile(
        r"(?<!\S)(?:"
        + "|".join(re.escape(key) for key in ABBREVIATION_REPLACEMENT)
        + r")(?=\s)"
    )
    text = abbreviation_re.sub(lambda match: ABBREVIATION_REPLACEMENT[match.group(0)], text)

    # Collapse whitespace
    text = WHITESPACE_RE.sub(" ", text)

    # Remove banned characters
    if remove_invalid_characters:
        allowed = set(symbols)  # build once for constant-time membership tests
        text = "".join(c for c in text if c in allowed)
    return text


if __name__ == "__main__":
    # Script to clean text for training.
    # Reads "filename|text" rows from --file, cleans the text part of each row
    # with clean_text, and writes "filename|cleaned text" rows to --output.
    parser = argparse.ArgumentParser(description="Clean & improve text for training")
    parser.add_argument("-f", "--file", help="Text file path", type=str, required=True)
    parser.add_argument("-o", "--output", help="Output text file path", type=str, required=True)
    args = parser.parse_args()

    cleaned_text = []

    # Explicit UTF-8 so non-ASCII characters such as currency symbols are read
    # the same way on every platform.
    with open(args.file, encoding="utf-8") as f:
        for line_number, row in enumerate(f, start=1):
            # Skip blank lines (e.g. a trailing newline at the end of the file).
            if not row.strip():
                continue
            # Split on the first "|" only, so a "|" inside the text does not
            # break the row.
            filename, separator, text = row.partition("|")
            if not separator:
                raise ValueError(
                    f"Line {line_number} of {args.file} is missing the '|' separator"
                )
            cleaned_text.append(f"{filename}|{clean_text(text)}")

    with open(args.output, "w", encoding="utf-8") as f:
        f.writelines(f"{line}\n" for line in cleaned_text)