panini / utils.py
amank
Made changes to the cleaning code, modified the number of warmup steps, and now get eval samples from the validation split
7839b8e
raw
history blame
1.96 kB
import regex as re
import string
def keep_devnagri(text: str):
    """
    Remove all non Devnagri characters from the text.

    Code adapted from https://huggingface.co/flax-community/roberta-base-mr/blob/64d2c745f264f09c3d5b678a718746b2613887db/mr_clean_text.py

    Runs of Devanagari-script characters, ASCII digits, the danda (।),
    whitespace, '.' and '!' are kept and joined in their original order;
    every other character is dropped. Runs of spaces are then collapsed
    to a single space.

    NOTE: ``re`` in this module is the third-party ``regex`` package, which
    is required for the Devanagari Unicode script property used below.

    @param text: str Text to be cleaned
    @return: Tuple[str, bool] The cleaned text, and a flag that is True when
        the cleaned text holds nothing but ASCII punctuation, digits,
        whitespace and dandas (this includes the empty string).
    """
    pattern = r'[\p{Devanagari}0-9।\s\.\!]+'
    # Characters that carry no content on their own. Whitespace and the danda
    # are listed explicitly because the keep-pattern above lets them through,
    # so text such as ". ." or "।" must still count as punctuation-only.
    filler_regex = re.compile(
        "[" + re.escape(string.punctuation) + string.digits + r"\s।" + "]"
    )
    # keep only the text which is in devnagari script
    cleaned = "".join(match.group() for match in re.finditer(pattern, text))
    # remove any extra space between words
    cleaned = re.sub(r"[ ]+", " ", cleaned)
    # nothing left once the filler characters are stripped => no real content
    is_just_punctuation = not filler_regex.sub("", cleaned)
    return cleaned, is_just_punctuation
def keep_devnagri_hf_doc(document):
    """
    Clean the 'text' entry of a document with keep_devnagri.

    ``document['text']`` may be a single string or a list of strings
    (presumably a single example vs. a batch from a Hugging Face dataset
    map call -- confirm against the caller). The entry is overwritten in
    place with the cleaned value(s) and the same ``document`` object is
    returned.

    @param document: mapping with a 'text' entry holding a str or a list of str
    @return: the same mapping, with 'text' replaced by its cleaned version
    @raise TypeError: if ``document['text']`` is neither a str nor a list
    """
    texts = document['text']

    def _clean(text):
        cleaned_text, is_just_punctuation = keep_devnagri(text)
        # An empty string may cause issues in the tokenizer, so texts with no
        # real content are replaced by a single space. Per the original
        # author this only happens for about 5 out of 10000 docs and should
        # not affect the results.
        return " " if is_just_punctuation else cleaned_text

    if isinstance(texts, str):
        document['text'] = _clean(texts)
    elif isinstance(texts, list):
        document['text'] = [_clean(text) for text in texts]
    else:
        raise TypeError(
            "document['text'] must be a str or a list of str, "
            f"got {type(texts).__name__}."
        )
    return document