File size: 1,748 Bytes
139e10d
 
 
7839b8e
139e10d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7839b8e
 
 
 
 
 
 
139e10d
7839b8e
 
 
 
 
 
 
 
 
 
 
7007f93
7839b8e
7007f93
7839b8e
139e10d
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
import regex as re
import string

def keep_devnagri(text:str):
    """
    Remove all non Devnagri characters from the text and report whether
    anything meaningful is left.

    Runs of Devanagari-script characters, ASCII digits, the danda (।),
    whitespace, '.' and '!' are kept and concatenated in their original
    order; every other character is dropped. Runs of spaces are then
    collapsed into a single space.
    Code adapted from https://huggingface.co/flax-community/roberta-base-mr/blob/64d2c745f264f09c3d5b678a718746b2613887db/mr_clean_text.py

    @param text: str Text to be cleaned
    @return: Tuple[str, bool] The cleaned text, and a flag that is True when
        the cleaned text contains nothing but ASCII punctuation, ASCII
        digits, dandas and whitespace (this includes the empty string)
    """
    # NOTE: the \p{...} script class relies on the third-party `regex`
    # module, which this file imports under the name `re`.
    keep_pattern = r'[\p{Devanagari}0-9।\s\.\!]+'

    # Characters that carry no content on their own: ASCII punctuation,
    # ASCII digits, the danda and any whitespace. Whitespace and the danda
    # must be part of this set, otherwise leftovers such as ". ." or "\n"
    # would be reported as real text.
    filler_regex = re.compile(
        "[" + re.escape(string.punctuation) + string.digits + "।" + r"\s" + "]"
    )

    # keep only the text which is in devnagari script
    cleaned = "".join(match.group() for match in re.finditer(keep_pattern, text))

    # remove any extra space between words
    cleaned = re.sub(r"[ ]+", " ", cleaned)

    # the text is "just punctuation" when nothing survives filler removal
    is_just_punctuation = not filler_regex.sub("", cleaned)

    return cleaned, is_just_punctuation

def keep_devnagri_hf_doc(document):
    """
    Clean the 'text' entry of a document with keep_devnagri, in place.

    Both a single example (document['text'] is a str) and a batch
    (document['text'] is a list of texts) are supported.

    @param document: mapping whose 'text' entry is a str or a list of str
    @return: the same document object, with 'text' replaced by the cleaned
        text (or list of cleaned texts)
    @raise TypeError: if document['text'] is neither a str nor a list
    """
    texts = document['text']
    if not isinstance(texts, (str, list)):
        # The check is on the 'text' entry, so the message names it.
        raise TypeError(
            "document['text'] must be a str or a list of str, "
            f"got {type(texts).__name__}."
        )

    def get_clean_text(text):
        cleaned_text, is_just_punctuation = keep_devnagri(text)
        # to handle the tokenizer as empty string may cause issues, texts
        # flagged as punctuation-only (which includes empty results) are
        # replaced by a single space; the original author notes this only
        # happens for 5 out of 10000 docs and should not affect the results
        return " " if is_just_punctuation else cleaned_text

    if isinstance(texts, list):
        document['text'] = [get_clean_text(text) for text in texts]
    else:
        document['text'] = get_clean_text(texts)

    return document