Spaces:
Sleeping
Sleeping
import json | |
import re | |
import unicodedata | |
from utils.norm_config import norm_config | |
def text_normalize(text, iso_code, lower_case=True, remove_numbers=True, remove_brackets=False):
    """Normalize ``text`` using the per-language rules stored in ``norm_config``.

    The steps are applied in this order: Unicode normalization, optional
    lower-casing, removal of parenthesised spans, regex mappings, replacement
    of punctuation with spaces, deletion of unwanted characters, optional
    removal of digit-only words, optional diacritic stripping and finally
    whitespace collapsing.

    Args:
        text: The string to be normalized.
        iso_code: Key into ``norm_config`` selecting the language-specific
            rules. Unknown codes use the ``"*"`` entry, and any field a
            language entry does not define is taken from ``"*"`` as well.
        lower_case: Lower-case the text. Only takes effect when the selected
            config's ``"lower_case"`` flag is also true.
        remove_numbers: Remove words made up solely of characters from the
            config's ``"digit_set"`` when they are delimited by whitespace.
        remove_brackets: Remove every parenthesised span. Spans that contain
            a digit, e.g. "(Sam 23:17)", are removed regardless of this flag.

    Returns:
        The normalized string, with runs of whitespace collapsed to a single
        space and leading/trailing whitespace stripped.
    """
    # Build a merged, call-local view of the configuration. Writing the
    # defaults back into the language entry would mutate the shared
    # module-level ``norm_config`` on every call.
    defaults = norm_config["*"]
    config = {**defaults, **norm_config.get(iso_code, defaults)}

    text = unicodedata.normalize(config["unicode_norm"], text)

    # Convert to lower case
    if config["lower_case"] and lower_case:
        text = text.lower()

    # Always drop bracketed text that contains a number; it usually
    # corresponds to a reference such as "(Sam 23:17)".
    text = re.sub(r"\([^\)]*\d[^\)]*\)", " ", text)
    if remove_brackets:
        text = re.sub(r"\([^\)]*\)", " ", text)

    # Apply mappings (keys are regular expressions, values are replacements)
    for old, new in config["mapping"].items():
        text = re.sub(old, new, text)

    normalized_text = text

    # Replace punctuation with a space. An empty set is skipped because "[]"
    # is not a valid regular expression.
    if config["punc_set"]:
        punct_pattern = "[" + config["punc_set"] + "]"
        normalized_text = re.sub(punct_pattern, " ", normalized_text)

    # Remove characters in the delete list (same empty-set guard as above)
    if config["del_set"]:
        delete_pattern = "[" + config["del_set"] + "]"
        normalized_text = re.sub(delete_pattern, "", normalized_text)

    # Remove words containing only digits. Three cases are handled:
    #   a) the text starts with a number,
    #   b) a number sits somewhere in the middle of the text,
    #   c) the text ends with a number.
    # Lookarounds check for the surrounding whitespace without consuming it,
    # which lets adjacent numbers that share a separator all be replaced.
    # A text that consists of nothing but digits matches none of the cases
    # and is left as is.
    if remove_numbers and config["digit_set"]:
        digits_pattern = "[" + config["digit_set"] + "]+"
        complete_digit_pattern = "|".join(
            (
                r"^" + digits_pattern + r"(?=\s)",
                r"(?<=\s)" + digits_pattern + r"(?=\s)",
                r"(?<=\s)" + digits_pattern + r"$",
            )
        )
        normalized_text = re.sub(complete_digit_pattern, " ", normalized_text)

    # "rm_diacritics" is treated as optional: a config that omits it simply
    # keeps its diacritics instead of raising KeyError.
    if config.get("rm_diacritics", False):
        # Imported lazily so unidecode is only required when a config
        # actually enables diacritic removal.
        from unidecode import unidecode

        normalized_text = unidecode(normalized_text)

    # Remove extra spaces
    normalized_text = re.sub(r"\s+", " ", normalized_text).strip()

    return normalized_text