import regex as re
import string

def keep_devnagri(document: dict) -> dict:
    """
    Strip everything except Devanagari text from ``document['text']``.

    Code adapted from https://huggingface.co/flax-community/roberta-base-mr/blob/64d2c745f264f09c3d5b678a718746b2613887db/mr_clean_text.py

    Runs of Devanagari-script characters, ASCII digits, the danda, whitespace,
    '.' and '!' are kept; every other character is dropped. Runs of spaces
    are then collapsed to a single space.

    The keep-pattern uses a Unicode script property, which the standard-library
    ``re`` module does not support; ``re`` in this file must therefore be the
    third-party ``regex`` package (imported as ``re`` at the top of the file).

    @param document: dict-like record whose 'text' entry is the string to clean
    @return: the same ``document`` object, with 'text' overwritten in place by
        the cleaned string, or by a single space when the cleaned string is
        empty or made up solely of ASCII punctuation and digits
    """
    text = document['text']

    # character runs that survive the cleaning
    keep_pattern = r'[\p{Devanagari}0-9।\s\.\!]+'

    # character class matching every ASCII punctuation symbol and digit
    punctuation_regex = re.compile("[" + re.escape(string.punctuation) + string.digits + "|" + "]")

    # keep only the text which is in devanagari script; the pattern has no
    # capture groups, so findall yields the full matched runs
    cleaned = "".join(re.findall(keep_pattern, text))

    # remove any extra space between words
    cleaned = re.sub(r"[ ]+", " ", cleaned)

    # True when nothing is left after deleting punctuation and digits
    # (this also covers an empty cleaned string).
    # NOTE(review): whitespace and the danda are not deleted by this check, so
    # a cleaned text such as " . " is NOT treated as punctuation-only and is
    # kept as-is -- confirm this is intended.
    is_just_punctuation = not punctuation_regex.sub("", cleaned)

    # to handle the tokenizer as empty string may cause issues
    # also this only happens for 5 out of 10000 docs, should not
    # affect the results
    document['text'] = " " if is_just_punctuation else cleaned
    return document