# NOTE: extraction residue removed here (a file-size banner, git-blame commit
# hashes, and a line-number gutter were pasted in from a blame/viewer tool);
# they were not valid Python and have been commented out.
import regex as re
import string
def keep_devnagri(text: str):
    """
    Remove all non-Devanagari characters from the text.

    Code adapted from https://huggingface.co/flax-community/roberta-base-mr/blob/64d2c745f264f09c3d5b678a718746b2613887db/mr_clean_text.py

    @param text: str Text to be cleaned
    @return: Tuple[str, bool] The cleaned text, and a flag that is True when
        the cleaned text consists only of punctuation/digit characters.
        (The original docstring said Union[str, bool], but the function
        returns a 2-tuple.)
    """
    # Keep Devanagari letters, ASCII digits, the danda (।), whitespace,
    # '.' and '!' — everything else is dropped.
    pattern = r'[\p{Devanagari}0-9।\s\.\!]+'
    # Regex matching any single punctuation symbol or digit. NOTE(review):
    # inside a character class "|" is a literal and is already covered by
    # string.punctuation, so the extra "|" is redundant (kept for identical
    # behavior).
    punctuation_regex = re.compile("[" + re.escape(string.punctuation) + string.digits + "|" + "]")
    # keep only the text which is in devnagari script
    cleaned = "".join([tok.group() for tok in re.finditer(pattern, text)])
    # remove any extra space between words
    cleaned = re.sub(r"[ ]+", " ", cleaned)
    # identify if the clean text only consists of punctuation/digits;
    # NOTE(review): whitespace is NOT stripped here, so e.g. " . " reduces
    # to "  " and is not flagged as punctuation-only.
    is_just_punctuation = len(re.sub(punctuation_regex, "", cleaned)) == 0
    return cleaned, is_just_punctuation
def keep_devnagri_hf_doc(document):
    """
    Clean the 'text' field of a HuggingFace dataset example in place,
    keeping only Devanagari text (see keep_devnagri).

    Handles both a single example (document['text'] is a str) and a
    batched example (document['text'] is a list of str).

    @param document: dict with a 'text' key holding a str or a list of str
    @return: the same dict, with its 'text' field cleaned
    @raises TypeError: when document['text'] is neither str nor list
    """
    if isinstance(document['text'], str):
        batched = False
    elif isinstance(document['text'], list):
        batched = True
    else:
        # Fixed message: the check is on document['text'] being str/list,
        # not on the document being a dictionary or list.
        raise TypeError("document['text'] must be a string or a list of strings.")

    def get_clean_text(text):
        cleaned_text, is_just_punctuation = keep_devnagri(text)
        # Replace punctuation-only results with a single space, as an empty
        # string may cause issues in the tokenizer; this only happens for
        # ~5 out of 10000 docs, so it should not affect the results.
        return cleaned_text if not is_just_punctuation else " "

    if batched:
        document['text'] = [get_clean_text(text) for text in document['text']]
    else:
        document['text'] = get_clean_text(document['text'])
    return document