amank
Made changes to the cleaning code, modified the number of warmup steps, and now get eval samples from the validation split
7839b8e
import regex as re | |
import string | |
def keep_devnagri(text: str) -> "tuple[str, bool]":
    """
    Remove all non Devnagri characters from the text.
    Code adapted from https://huggingface.co/flax-community/roberta-base-mr/blob/64d2c745f264f09c3d5b678a718746b2613887db/mr_clean_text.py

    Besides Devanagari-script characters, ASCII digits, the danda, whitespace,
    '.' and '!' are kept. Everything else is dropped and the surviving runs are
    concatenated directly, after which repeated spaces are collapsed to one.

    @param text: str Text to be cleaned
    @return: Tuple[str, bool] The cleaned text, and a flag that is True when the
        cleaned text holds nothing but ASCII punctuation, ASCII digits,
        whitespace and danda (this includes the empty string)
    """
    # NOTE: the \p{Devanagari} script property needs the third-party `regex`
    # package, which this module imports under the name `re`.
    pattern = r'[\p{Devanagari}0-9।\s\.\!]+'
    # characters that carry no content on their own: ASCII punctuation and
    # digits, plus whitespace and the danda, so that inputs such as "1 2" or
    # ". ." are flagged the same way as "12" or ".."
    filler_regex = re.compile(
        "[" + re.escape(string.punctuation) + string.digits + r"\s।" + "]"
    )
    # keep only the text which is in devnagari script
    cleaned = "".join(tok.group() for tok in re.finditer(pattern, text))
    # remove any extra space between words
    cleaned = re.sub(r"[ ]+", " ", cleaned)
    # the text is "just punctuation" when nothing is left once filler is removed
    is_just_punctuation = not filler_regex.sub("", cleaned)
    return cleaned, is_just_punctuation
def keep_devnagri_hf_doc(document):
    """
    Clean the 'text' field of a document with keep_devnagri.

    Handles both a single example (document['text'] is a str) and a batch
    (document['text'] is a list of str). The document is modified in place and
    also returned. Presumably meant to be passed to a dataset `map` call, with
    or without batching -- confirm against the caller.

    @param document: mapping with a 'text' entry holding a str or a list of str
    @return: the same document object with 'text' replaced by the cleaned text
    @raise TypeError: if document['text'] is neither a str nor a list
    """
    def get_clean_text(text):
        cleaned_text, is_just_punctuation = keep_devnagri(text)
        # to handle the tokenizer as empty string may cause issues
        # also this only happens for 5 out of 10000 docs, should not
        # affect the results
        return " " if is_just_punctuation else cleaned_text

    raw_text = document['text']
    if isinstance(raw_text, str):
        document['text'] = get_clean_text(raw_text)
    elif isinstance(raw_text, list):
        document['text'] = [get_clean_text(text) for text in raw_text]
    else:
        # the check is on the 'text' field, so say so in the error
        raise TypeError(
            "document['text'] must be a str or a list of str, got "
            f"{type(raw_text).__name__}."
        )
    return document