import sys
import string

import regex as re  # third-party `regex` (not stdlib `re`): required for \p{Devanagari} classes
from datasets import load_dataset

# Usage: python <script>.py <split>   (e.g. "train" or "validation")
split_type = sys.argv[1]


def clean_marathi_text(text):
    """Return *text* reduced to Devanagari characters and whitespace,
    with every whitespace run collapsed to a single space.

    NOTE(review): defined but not called by the driver below — kept for
    backward compatibility / comparison with clean_marathi_text2.
    """
    # Patterns without groups make findall return the matched strings directly,
    # equivalent to joining finditer match groups.
    clean_text = "".join(re.findall(r"[\u0900-\u097F\s]", text))
    return re.sub(r"\s+", " ", clean_text)


def clean_marathi_text2(text):
    """Extract Devanagari content from *text*.

    Returns a tuple ``(cleaned, punct_free)``:
      * ``cleaned`` — concatenation of all runs of Devanagari characters,
        whitespace, '.' and '!', with runs of *spaces* (only) collapsed;
      * ``punct_free`` — ``cleaned`` with all ASCII punctuation removed and
        stripped; the caller uses its truthiness as a "document survived
        cleaning" signal.
    """
    # Runs of Devanagari script plus whitespace and sentence punctuation.
    devanagari_runs = r"[\p{Devanagari}\s\.\!]+"
    # ASCII punctuation character class for the punctuation-free variant.
    punct_rm = re.compile("[" + re.escape(string.punctuation) + "]")

    cleaned = "".join(re.findall(devanagari_runs, text))
    # Collapse space runs only; newlines are deliberately left intact.
    cleaned = re.sub(r"[ ]+", " ", cleaned)
    punct_free = punct_rm.sub("", cleaned)
    return cleaned, punct_free.strip()


# Stream the Marathi portion of mC4 for the requested split.
mr_mc4 = load_dataset("mc4", "mr", split=split_type)
print(mr_mc4)

count = 0
with open(f"mr_{split_type}_clean2.txt", "w", encoding="utf-8") as f:
    for each in mr_mc4:
        clean_text, only_punct = clean_marathi_text2(each["text"])
        if only_punct:
            count += 1
        # Every source document gets a slot, even if empty after cleaning:
        # documents are separated by a blank line.
        f.write(clean_text.strip())
        f.write("\n\n")

print(f"{split_type} clean docs count {count} out of {len(mr_mc4)}")