Spaces:
Build error
Build error
File size: 3,745 Bytes
bcfd9f0 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 |
import argparse
import re
import inflect
from training import DEFAULT_ALPHABET
# Module-level inflect engine; clean_text() calls its number_to_words()
# to spell out ordinals and numbers.
INFLECT_ENGINE = inflect.engine()
# Numbers with thousands separators such as "1,000". The middle run may hold
# no comma at all, so plain numbers of three or more digits match as well.
COMMA_NUMBER_RE = re.compile(r"([0-9][0-9\,]+[0-9])")
# Decimal numbers such as "3.14".
DECIMAL_NUMBER_RE = re.compile(r"([0-9]+\.[0-9]+)")
# Any run of digits.
NUMBER_RE = re.compile(r"[0-9]+")
# Ordinals such as "1st", "22nd", "3rd", "4th". The suffix must be a real
# alternation: a character class like [st|nd|rd|th]+ also accepts "3d",
# "1990s" or "5|". A single capture group is kept so re.findall() still
# returns plain strings.
ORDINALS = re.compile(r"([0-9]+(?:st|nd|rd|th))\b")
# Exactly one currency symbol directly followed by digits, e.g. "$5".
# ("|" has no special meaning inside a character class, so it is not listed.)
CURRENCY = re.compile(r"([£$€][0-9]+)")
# Any run of whitespace.
WHITESPACE_RE = re.compile(r"\s+")
# Runs of characters other than a-z, space and , . ! ? ' -
ALLOWED_CHARACTERS_RE = re.compile("[^a-z ,.!?'-]+")
# Text appended after the amount when a currency symbol is spelled out.
MONETARY_REPLACEMENT = {"$": " dollars", "£": " pounds", "€": " euros"}
# Lower-case abbreviations (with trailing period) and their expansions.
# NOTE(review): "misess" reads like a phonetic spelling of "Mrs." and is kept
# byte-for-byte - confirm it is intentional before "correcting" it.
ABBREVIATION_REPLACEMENT = {
    "mr.": "mister",
    "mrs.": "misess",
    "dr.": "doctor",
    "no.": "number",
    "st.": "saint",
    "co.": "company",
    "jr.": "junior",
    "maj.": "major",
    "gen.": "general",
    "drs.": "doctors",
    "rev.": "reverend",
    "lt.": "lieutenant",
    "hon.": "honorable",
    "sgt.": "sergeant",
    "capt.": "captain",
    "esq.": "esquire",
    "ltd.": "limited",
    "col.": "colonel",
    "ft.": "fort",
}
def _spell_out_number(match):
    """Return the matched number or ordinal written out in words."""
    return INFLECT_ENGINE.number_to_words(match.group(0))


def _currency_to_words(match):
    """Move the currency of a match such as "$5" behind the amount ("5 dollars")."""
    amount = match.group(0)
    for symbol, unit in MONETARY_REPLACEMENT.items():
        if symbol in amount:
            # Drop the leading symbol and append the spelled-out unit.
            return amount[1:] + unit
    return amount


def clean_text(text, symbols=DEFAULT_ALPHABET, remove_invalid_characters=True):
    """
    Cleans text. This includes:
    - Stripping surrounding whitespace and lower-casing
    - Replacing monetary terms (i.e. $ -> dollars)
    - Converting ordinals to full words (i.e. 1st -> first)
    - Converting numbers to their full word format (i.e. 100 -> one hundred)
    - Replacing abbreviations (i.e. dr. -> doctor)
    - Collapsing runs of whitespace into a single space
    - Removing characters that are not in the symbols list

    Every substitution is applied per regex match, so a short token is never
    replaced inside a longer one ("1" inside "10", "$5" inside "$50").

    Parameters
    ----------
    text : str
        Text to clean
    symbols : list (optional)
        List of valid symbols in text (default is DEFAULT_ALPHABET)
    remove_invalid_characters : bool (optional)
        Whether to remove characters not in symbols list (default is True)

    Returns
    -------
    str
        Cleaned text
    """
    text = text.strip().lower()

    # Convert currency to words
    text = CURRENCY.sub(_currency_to_words, text)

    # Convert ordinals to words (before plain numbers consume their digits)
    text = ORDINALS.sub(_spell_out_number, text)

    # Convert comma & decimal numbers to words in a single pass. The decimal
    # alternative is tried first so that "1234.5" or "3.141" is converted as
    # a whole instead of losing part of it to the comma-number pattern.
    decimal_or_comma_re = re.compile(
        DECIMAL_NUMBER_RE.pattern + "|" + COMMA_NUMBER_RE.pattern
    )
    text = decimal_or_comma_re.sub(_spell_out_number, text)

    # Convert standard numbers to words
    text = NUMBER_RE.sub(_spell_out_number, text)

    # Replace abbreviations. They must form a whole whitespace-delimited
    # token, which (unlike matching " key ") also covers the start and the
    # end of the text.
    abbreviation_re = re.compile(
        r"(?<!\S)(?:"
        + "|".join(re.escape(key) for key in ABBREVIATION_REPLACEMENT)
        + r")(?!\S)"
    )
    text = abbreviation_re.sub(
        lambda match: ABBREVIATION_REPLACEMENT[match.group(0)], text
    )

    # Collapse whitespace
    text = WHITESPACE_RE.sub(" ", text)

    # Remove banned characters
    if remove_invalid_characters:
        allowed = set(symbols)  # built once for O(1) membership tests
        text = "".join(c for c in text if c in allowed)

    return text
def main():
    """
    Script to clean text for training.

    Reads a text file of "filename|text" rows, cleans the text part of each
    row with clean_text() and writes "filename|cleaned text" rows to the
    output file. Blank rows are skipped.

    Raises
    ------
    ValueError
        If a non-blank row does not contain exactly one "|" separator
    """
    parser = argparse.ArgumentParser(description="Clean & improve text for training")
    parser.add_argument("-f", "--file", help="Text file path", type=str, required=True)
    parser.add_argument("-o", "--output", help="Output text file path", type=str, required=True)
    args = parser.parse_args()

    # Read everything before the output is opened, so that using the same
    # path for input and output does not truncate the data being read.
    with open(args.file, encoding="utf-8") as f:
        rows = f.readlines()

    cleaned_text = []
    for line_number, row in enumerate(rows, start=1):
        if not row.strip():
            # A blank (e.g. trailing) line carries no data.
            continue
        fields = row.split("|")
        if len(fields) != 2:
            raise ValueError(
                f"{args.file}:{line_number}: expected 'filename|text', got {row!r}"
            )
        filename, text = fields
        cleaned_text.append(f"{filename}|{clean_text(text)}")

    with open(args.output, "w", encoding="utf-8") as f:
        f.writelines(f"{line}\n" for line in cleaned_text)


if __name__ == "__main__":
    main()
|