# Spaces: Running on T4
import re
# List of (compiled regular expression, replacement) pairs for abbreviations
# in English. Each pattern matches the abbreviation at a word boundary,
# followed by a literal period, ignoring case (e.g. "Mr.", "MR.", "mr.").
# The trailing period is part of the match, so it is consumed by the
# substitution.
abbreviations_en = [
    (re.compile(r"\b%s\." % abbreviation, re.IGNORECASE), expansion)
    for abbreviation, expansion in [
        # NOTE(review): "misess" looks like a nonstandard spelling of
        # "missus"; left unchanged because it is runtime output that
        # downstream consumers may depend on -- confirm before editing.
        ("mrs", "misess"),
        ("mr", "mister"),
        ("dr", "doctor"),
        ("st", "saint"),
        ("co", "company"),
        ("jr", "junior"),
        ("maj", "major"),
        ("gen", "general"),
        ("drs", "doctors"),
        ("rev", "reverend"),
        ("lt", "lieutenant"),
        ("hon", "honorable"),
        ("sgt", "sergeant"),
        ("capt", "captain"),
        ("esq", "esquire"),
        ("ltd", "limited"),
        ("col", "colonel"),
        ("ft", "fort"),
    ]
]
def expand_abbreviations(text: str, lang: str = "en") -> str:
    """Replace known abbreviations in *text* with their expansions.

    Args:
        text: The string to process.
        lang: Language code selecting the abbreviation table. Only ``"en"``
            is supported.

    Returns:
        ``text`` after substituting every match of each pattern in
        ``abbreviations_en`` with its replacement, applied in table order.

    Raises:
        NotImplementedError: If ``lang`` is anything other than ``"en"``.
    """
    # Reject unsupported languages up front, before touching any table.
    if lang != "en":
        raise NotImplementedError(
            "no abbreviation table for lang=%r" % (lang,)
        )

    # The table entries are already compiled, so call .sub on the pattern
    # object directly rather than going through re.sub.
    for pattern, expansion in abbreviations_en:
        text = pattern.sub(expansion, text)
    return text