import regex as re
import string
def keep_devnagri(text: str):
    """
    Remove all non-Devanagari characters from the text.

    Code adapted from an external source (the reference was not preserved
    alongside this function).

    Runs of Devanagari-script characters, ASCII digits, the danda ("।"),
    whitespace, "." and "!" are kept and concatenated in their original
    order; every other character is dropped. Runs of the space character are
    then collapsed to a single space (other whitespace, e.g. newlines, is
    left as is).

    @param text: str Text to be cleaned
    @return: Tuple[str, bool] The cleaned text, and a flag that is True when
        nothing is left of the cleaned text once ASCII punctuation and ASCII
        digits are removed (this includes the empty string). Whitespace is
        not part of that character class, so a whitespace-only result is
        NOT flagged.
    """
    # NOTE: `re` is the third-party `regex` package (see the file's imports);
    # the \p{Devanagari} script property is not supported by the stdlib `re`.
    pattern = r'[\p{Devanagari}0-9।\s\.\!]+'

    # regex pattern for all punctuation symbols (and ASCII digits)
    punctuation_regex = re.compile("[" + re.escape(string.punctuation) + string.digits + "|" + "]")

    # keep only the text which is in Devanagari script; each match is one
    # maximal run of allowed characters, so joining the matches drops
    # everything in between
    cleaned = "".join(tok.group() for tok in re.finditer(pattern, text))

    # remove any extra space between words
    cleaned = re.sub(r"[ ]+", " ", cleaned)

    # identify if the clean text only consists of punctuation / digits
    is_just_punctuation = not punctuation_regex.sub("", cleaned)

    return cleaned, is_just_punctuation
def keep_devnagri_hf_doc(document):
    """
    Clean the 'text' field of a document with `keep_devnagri`.

    `document['text']` may be a single string or a list of strings (a
    batch). The field is replaced in place by its cleaned counterpart and
    the same `document` object is returned.

    @param document: mapping with a 'text' entry holding a str or a list
    @return: the same `document`, with 'text' cleaned
    @raise TypeError: if `document['text']` is neither a str nor a list
    """
    texts = document['text']
    if isinstance(texts, str):
        batched = False
    elif isinstance(texts, list):
        batched = True
    else:
        raise TypeError("document['text'] must be a str or a list of str.")

    def get_clean_text(text):
        cleaned_text, is_just_punctuation = keep_devnagri(text)
        # to handle the tokenizer as empty string may cause issues
        # also this only happens for 5 out of 10000 docs, should not
        # affect the results
        return " " if is_just_punctuation else cleaned_text

    if batched:
        # collect one cleaned string per input string, preserving order
        document['text'] = [get_clean_text(text) for text in texts]
    else:
        document['text'] = get_clean_text(texts)

    return document