|
import re |
|
|
|
import inflect |
|
|
|
|
|
# Shared inflect engine; the number and ordinal expanders below call its
# number_to_words() method.
__INFLECT = inflect.engine()
# A run that starts and ends with a digit and has digits/commas in between,
# e.g. "1,234". Used to strip the commas before any other numeric pass.
__COMMA_NUMBER_PATTERN = re.compile(r"([0-9][0-9\,]+[0-9])")
# Digits, a single ".", digits (e.g. "3.14"); group 1 is the whole number.
__DECIMAL_NUMBER_PATTERN = re.compile(r"([0-9]+\.[0-9]+)")
# "£" followed by digits/commas ending in a digit; group 1 is the amount.
__POUNDS_PATTERN = re.compile(r"£([0-9\,]*[0-9]+)")
# "$" followed by digits/dots/commas ending in a digit; group 1 is the amount.
__DOLLARS_PATTERN = re.compile(r"\$([0-9\.\,]*[0-9]+)")
# Digits immediately followed by an English ordinal suffix (st, nd, rd, th).
__ORDINAL_PATTERN = re.compile(r"[0-9]+(st|nd|rd|th)")
# Any remaining run of ASCII digits.
__NUMBER_PATTERN = re.compile(r"[0-9]+")
|
|
|
|
|
def normalize_text(text: str) -> str:
    """Normalize English text: expand numbers, replace punctuation, fix spacing.

    Args:
        text: Raw input text.

    Returns:
        The text after number expansion and punctuation replacement, with one
        space inserted wherever one of ``, ; . ? !`` is directly followed by a
        word character.
    """
    # Numbers are handled first: the currency and decimal patterns need the
    # original "$" and "." characters, and replace_punctuation() rewrites "$".
    with_numbers_expanded = __normalize_numbers(text)
    with_punctuation_replaced = replace_punctuation(with_numbers_expanded)
    # "a,b" -> "a, b": detach punctuation from the word that follows it.
    return re.sub(r"([,;.\?\!])([\w])", r"\1 \2", with_punctuation_replaced)
|
|
|
|
|
# Punctuation rewrite table used by replace_punctuation().
# Every value is one of: "," "." "!" "?" "..." "'" "-".
# Full-width keys are written as \u escapes so they cannot be silently
# folded into their ASCII look-alikes (which turns entries into duplicate or
# no-op keys).
__PUNCTUATION_MAP = {
    # Clause separators -> comma.
    ":": ",",
    "\uff1a": ",",  # full-width colon
    ";": ",",
    "\uff1b": ",",  # full-width semicolon
    "\uff0c": ",",  # full-width comma
    "·": ",",
    "・": ",",
    "、": ",",
    # Sentence terminators.
    "。": ".",
    "\uff0e": ".",  # full-width full stop
    "\n": ".",
    "$": ".",
    "\uff01": "!",  # full-width exclamation mark
    "\uff1f": "?",  # full-width question mark
    # Ellipses.
    "…": "...",
    "···": "...",
    "・・・": "...",
    # Quotes and brackets -> apostrophe.
    "“": "'",
    "”": "'",
    '"': "'",
    "‘": "'",
    "’": "'",
    "(": "'",
    ")": "'",
    "\uff08": "'",  # full-width left parenthesis
    "\uff09": "'",  # full-width right parenthesis
    "《": "'",
    "》": "'",
    "【": "'",
    "】": "'",
    "[": "'",
    "]": "'",
    "「": "'",
    "」": "'",
    # Dashes and tildes -> hyphen.
    "—": "-",
    "−": "-",
    "~": "-",
    "\uff5e": "-",  # full-width tilde
}

# Compiled once at import time instead of on every call. Longer keys are
# listed first so "···" is matched as an ellipsis before the single "·"
# alternative can claim its first character, whatever the dict order is.
__PUNCTUATION_PATTERN = re.compile(
    "|".join(
        re.escape(symbol)
        for symbol in sorted(__PUNCTUATION_MAP, key=len, reverse=True)
    )
)


def replace_punctuation(text: str) -> str:
    """Replace assorted punctuation in ``text`` with a small ASCII set.

    Each occurrence of a key of the module-level punctuation table is replaced
    by its mapped value (``,``, ``.``, ``!``, ``?``, ``...``, ``'`` or ``-``).
    Characters that are not in the table are left untouched.

    Args:
        text: Text to rewrite; may be empty.

    Returns:
        A new string with the replacements applied in a single pass.
    """
    return __PUNCTUATION_PATTERN.sub(
        lambda found: __PUNCTUATION_MAP[found.group()], text
    )
|
|
|
|
|
def __normalize_numbers(text: str) -> str:
    """Rewrite the numeric expressions found in ``text``.

    The passes run strictly in the order listed, each one on the output of the
    previous one: comma removal, pounds, dollars, decimals, ordinals, and
    finally any remaining digit runs.

    Args:
        text: Text that may contain digits, ``£``/``$`` amounts or ordinals.

    Returns:
        The text after all substitution passes.
    """
    passes = (
        (__COMMA_NUMBER_PATTERN, __remove_commas),
        (__POUNDS_PATTERN, r"\1 pounds"),
        (__DOLLARS_PATTERN, __expand_dollars),
        (__DECIMAL_NUMBER_PATTERN, __expand_decimal_point),
        (__ORDINAL_PATTERN, __expand_ordinal),
        (__NUMBER_PATTERN, __expand_number),
    )
    for pattern, replacement in passes:
        text = pattern.sub(replacement, text)
    return text
|
|
|
|
|
def __expand_dollars(m: re.Match[str]) -> str:
    """Rewrite a matched dollar amount as a count followed by its unit.

    Args:
        m: Match whose group 1 is the amount without the leading ``$``; it may
            contain digits, ``.`` and ``,``.

    Returns:
        ``"<d> dollar(s)"``, ``"<c> cent(s)"``, or both joined by ``", "``;
        ``"zero dollars"`` when both parts are zero; or the raw amount followed
        by ``" dollars"`` when it contains more than one ``.``.
    """
    amount = m.group(1)
    if amount.count(".") > 1:
        # Not a plain dollars.cents amount (e.g. "1.2.3"): keep it verbatim.
        return f"{amount} dollars"

    # A comma can survive the comma-removal pass (e.g. "$,5" or "$1,.50") and
    # would make int() raise ValueError, so strip commas before parsing.
    whole, _, fraction = amount.replace(",", "").partition(".")
    dollars = int(whole) if whole else 0
    # NOTE: the fraction is parsed as a plain integer, so ".5" yields 5 cents
    # (not 50); kept as-is to preserve existing output.
    cents = int(fraction) if fraction else 0

    spoken = []
    if dollars:
        dollar_unit = "dollar" if dollars == 1 else "dollars"
        spoken.append(f"{dollars} {dollar_unit}")
    if cents:
        cent_unit = "cent" if cents == 1 else "cents"
        spoken.append(f"{cents} {cent_unit}")
    return ", ".join(spoken) if spoken else "zero dollars"
|
|
|
|
|
def __remove_commas(m: re.Match[str]) -> str:
    """Return group 1 of the match with every comma removed."""
    digits_with_commas = m.group(1)
    return "".join(digits_with_commas.split(","))
|
|
|
|
|
def __expand_ordinal(m: re.Match[str]) -> str:
    """Convert the whole matched text to words with the shared inflect engine.

    When used with the ordinal pattern the matched text is a digit run plus
    its ``st``/``nd``/``rd``/``th`` suffix.
    """
    ordinal_token = m.group(0)
    spoken = __INFLECT.number_to_words(ordinal_token)
    return spoken
|
|
|
|
|
def __expand_number(m: re.Match[str]) -> str:
    """Convert a matched run of digits to words with the shared inflect engine.

    Values strictly between 1000 and 3000 get special treatment:

    * 2000 is ``"two thousand"``;
    * 2001-2009 are ``"two thousand "`` plus the words for the last two digits;
    * exact multiples of 100 are the words for the hundreds count plus
      ``" hundred"``;
    * anything else is converted in two-digit groups (``group=2``, zeros read
      as ``"oh"``) with the ``", "`` between groups turned into a space.

    Every other value is converted normally, with no "and" word.
    """
    value = int(m.group(0))
    to_words = __INFLECT.number_to_words

    if not 1000 < value < 3000:
        return to_words(value, andword="")
    if value == 2000:
        return "two thousand"
    if 2000 < value < 2010:
        return "two thousand " + to_words(value % 100)
    if value % 100 == 0:
        return to_words(value // 100) + " hundred"

    grouped = to_words(value, andword="", zero="oh", group=2)
    return grouped.replace(", ", " ")
|
|
|
|
|
def __expand_decimal_point(m: re.Match[str]) -> str:
    """Return group 1 of the match with each ``.`` replaced by ``" point "``."""
    pieces = m.group(1).split(".")
    return " point ".join(pieces)
|
|