import regex as re
import string

# Runs of characters that are kept: Devanagari-script code points, ASCII
# digits, the danda (U+0964), whitespace, full stop and exclamation mark.
# The danda is written as an escape on purpose: the literal character had been
# corrupted into the Cyrillic letters "ред" (its UTF-8 bytes read as CP866),
# which made the pattern keep Cyrillic letters and drop the danda.
_DEVANAGARI_RUN = re.compile(r'[\p{Devanagari}0-9\u0964\s\.\!]+')

# One or more consecutive plain spaces (other whitespace is left untouched).
_SPACE_RUN = re.compile(r"[ ]+")

# Any single ASCII punctuation character or ASCII digit.
_PUNCT_OR_DIGIT = re.compile(
    "[" + re.escape(string.punctuation) + string.digits + "]"
)


def keep_devnagri(document: dict) -> dict:
    """
    Remove all non Devnagri characters from the text of a document.

    Code adapted from
    https://huggingface.co/flax-community/roberta-base-mr/blob/64d2c745f264f09c3d5b678a718746b2613887db/mr_clean_text.py

    The document is modified in place and also returned.

    @param document: mapping with a 'text' entry holding the string to clean
    @return: the same mapping, with 'text' replaced by the cleaned string, or
        by a single space when nothing but ASCII punctuation/digits is left
    """
    text = document['text']

    # Keep only the allowed runs and glue them back together.
    cleaned = "".join(_DEVANAGARI_RUN.findall(text))

    # Collapse repeated spaces between words into one.
    cleaned = _SPACE_RUN.sub(" ", cleaned)

    # Anything left once punctuation and digits are stripped counts as content.
    # NOTE(review): whitespace is not stripped here, so a text such as ". ."
    # is treated as having content and is kept as-is -- confirm this is wanted.
    has_content = bool(_PUNCT_OR_DIGIT.sub("", cleaned))

    # An empty string may cause issues in the tokenizer, so a single space is
    # stored instead. Per the original author this affects roughly 5 out of
    # 10000 documents and should not influence the results.
    document['text'] = cleaned if has_content else " "

    return document