import regex as re  # third-party `regex`: needed for \p{Devanagari}; stdlib `re` has no \p support
import string


def keep_devnagri(text: str):
    """
    Remove all non-Devanagari characters from the text.

    Code adapted from
    https://huggingface.co/flax-community/roberta-base-mr/blob/64d2c745f264f09c3d5b678a718746b2613887db/mr_clean_text.py

    @param text: str Text to be cleaned
    @return: Tuple[str, bool] The cleaned text, and a flag that is True when
        the cleaned text consists solely of punctuation/digit characters
        (i.e. no Devanagari content survived the filtering).
    """
    # Keep Devanagari letters, ASCII digits, the danda '।' (U+0964),
    # whitespace, periods and exclamation marks.
    # BUGFIX: the original pattern contained the mojibake sequence "ред"
    # (Cyrillic) where the danda '।' belongs, so the danda was dropped and
    # stray Cyrillic р/е/д were kept instead.
    pattern = r'[\p{Devanagari}0-9।\s\.\!]+'

    # Regex matching any punctuation symbol or digit.  Note: the original
    # also appended a literal "|", which is redundant — '|' is already part
    # of string.punctuation and is literal inside a character class anyway.
    punctuation_regex = re.compile(
        "[" + re.escape(string.punctuation) + string.digits + "]"
    )

    # Keep only the spans of text that match the Devanagari pattern.
    cleaned = "".join(tok.group() for tok in re.finditer(pattern, text))

    # Collapse runs of spaces between words.
    cleaned = re.sub(r"[ ]+", " ", cleaned)

    # True when nothing but punctuation/digits remains after cleaning.
    # NOTE(review): leftover whitespace counts as content here, so a
    # whitespace-only result is NOT flagged — preserved from the original.
    is_just_punctuation = len(punctuation_regex.sub("", cleaned)) == 0

    return cleaned, is_just_punctuation


def keep_devnagri_hf_doc(document):
    """
    Clean the 'text' field of a HuggingFace-datasets style example.

    Works both for a single example (document['text'] is a str) and for a
    batched map call (document['text'] is a list of str), mutating and
    returning the same mapping.

    @param document: Mapping with a 'text' field (str, or list of str when
        batched).
    @return: The same mapping with its 'text' field cleaned in place.
    @raise TypeError: if document['text'] is neither str nor list.
    """
    if isinstance(document['text'], str):
        batched = False
    elif isinstance(document['text'], list):
        batched = True
    else:
        # BUGFIX: the original message claimed the *document* had to be a
        # dict or list, but the check is actually on document['text'].
        raise TypeError("document['text'] must be a str or a list of str.")

    def get_clean_text(text):
        cleaned_text, is_just_punctuation = keep_devnagri(text)
        # to handle the tokenizer as empty string may cause issues
        # also this only happens for 5 out of 10000 docs, should not
        # affect the results
        cleaned_text = cleaned_text if not is_just_punctuation else " "
        return cleaned_text

    if batched:
        document['text'] = [get_clean_text(text) for text in document['text']]
    else:
        document['text'] = get_clean_text(document['text'])
    return document