# File size: 1,184 Bytes

# 41593c6

import sys
import string
import regex as re
# import re

from datasets import load_dataset

# Dataset split to process, read from the first command-line argument; it is
# passed to load_dataset() and embedded in the output file name below.
split_type = sys.argv[1]

def clean_marathi_text(text):
    """Keep only Devanagari-block characters and whitespace from *text*.

    Any character that is neither in the range U+0900-U+097F nor whitespace
    is dropped. Each run of whitespace in what remains is then collapsed to a
    single space. The result is not stripped, so leading or trailing
    whitespace survives as one space.
    """
    kept_chars = re.findall(r'[\u0900-\u097F\s]', text)
    return re.sub(r"\s+", " ", "".join(kept_chars))

# Deleting ASCII punctuation is identical for every document, so the
# translation table is built once rather than on every call.
_ASCII_PUNCT_TABLE = str.maketrans("", "", string.punctuation)


def clean_marathi_text2(text):
    """Reduce *text* to Devanagari-script characters, whitespace, '.' and '!'.

    The keep-pattern uses the Unicode script property for Devanagari, which
    requires the third-party ``regex`` module that this file imports under
    the name ``re``; the standard-library ``re`` rejects that syntax.

    Args:
        text: The raw document string.

    Returns:
        A ``(cleaned, bare)`` tuple. ``cleaned`` is the concatenation of all
        kept runs with each run of spaces collapsed to one space (it is not
        stripped). ``bare`` is ``cleaned`` with every ASCII punctuation
        character removed and surrounding whitespace stripped; it is empty
        when nothing but punctuation and whitespace was kept.
    """
    keep_pattern = r'[\p{Devanagari}\s\.\!]+'
    cleaned = "".join(re.findall(keep_pattern, text))
    # Only runs of the space character are collapsed; newlines and tabs
    # pass through unchanged.
    cleaned = re.sub(r"[ ]+", " ", cleaned)
    bare = cleaned.translate(_ASCII_PUNCT_TABLE).strip()
    return cleaned, bare


# Load the "mr" configuration of the "mc4" dataset for the requested split.
mr_mc4 = load_dataset("mc4", "mr", split=split_type)

print(mr_mc4)

# Write each document that still has content after cleaning, followed by a
# blank line, and count how many documents were written.
count = 0
with open(f"mr_{split_type}_clean2.txt", 'w', encoding="utf-8") as out_file:
    for record in mr_mc4:
        kept_text, bare_text = clean_marathi_text2(record["text"])
        if not bare_text:
            # Nothing is left once punctuation and surrounding whitespace
            # are removed, so the document is skipped.
            continue
        count += 1
        out_file.write(kept_text.strip() + "\n\n")
print(f"{split_type} clean docs count {count} out of {len(mr_mc4)}")