import string
import sys

import regex as re
from datasets import load_dataset


# The dataset split to process is the first command-line argument; it is
# passed to load_dataset(split=...) and embedded in the output file name.
if len(sys.argv) < 2:
    # Fail with a readable usage line instead of a bare IndexError traceback.
    sys.exit(f"usage: {sys.argv[0]} <split>")
split_type = sys.argv[1]


# Runs of characters from the Devanagari block (U+0900-U+097F) or whitespace.
# Matching whole runs avoids creating one match object per character.
_DEVANAGARI_BLOCK_RUN_RE = re.compile(r'[\u0900-\u097F\s]+')
_WHITESPACE_RUN_RE = re.compile(r"\s+")


def clean_marathi_text(text):
    """Keep only Devanagari-block characters and whitespace from *text*.

    Every character outside U+0900-U+097F that is not whitespace is
    dropped, then each run of whitespace (spaces, tabs, newlines, ...) is
    collapsed into a single space.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string. It is not stripped, so it may begin or end
        with a single space.
    """
    kept = "".join(_DEVANAGARI_BLOCK_RUN_RE.findall(text))
    return _WHITESPACE_RUN_RE.sub(" ", kept)


# Runs of Devanagari-script characters, whitespace, "." and "!".
# NOTE: \p{Devanagari} is a Unicode script property; it is supported by the
# third-party `regex` module (imported in this file as `re`), not by the
# standard-library `re`.
_MARATHI_RUN_RE = re.compile(r'[\p{Devanagari}\s\.\!]+')
_SPACE_RUN_RE = re.compile(r"[ ]+")
# Translation table that deletes every ASCII punctuation character; built
# once here instead of rebuilding a punctuation regex on every call.
_DELETE_ASCII_PUNCT = str.maketrans("", "", string.punctuation)


def clean_marathi_text2(text):
    """Clean *text* down to Devanagari script, whitespace, "." and "!".

    All other characters are dropped and each run of plain spaces is
    collapsed into one space. Only the space character is collapsed:
    newlines and tabs that were in the input are kept as they are.

    Args:
        text: The raw string to clean.

    Returns:
        A 2-tuple ``(cleaned, without_punct)``:

        * ``cleaned`` - the cleaned text, not stripped.
        * ``without_punct`` - ``cleaned`` with every character of
          ``string.punctuation`` removed and surrounding whitespace
          stripped. It is empty when ``cleaned`` holds nothing but
          whitespace and punctuation, so callers can use its truthiness
          to detect documents with no real content.
    """
    cleaned = _SPACE_RUN_RE.sub(" ", "".join(_MARATHI_RUN_RE.findall(text)))
    without_punct = cleaned.translate(_DELETE_ASCII_PUNCT).strip()
    return cleaned, without_punct


# Load the requested split of the "mr" configuration of the "mc4" dataset.
mr_mc4 = load_dataset("mc4", "mr", split=split_type)

print(mr_mc4)

# Clean every document and write the ones that still have content to the
# output file, each followed by a blank line.
count = 0
with open(f"mr_{split_type}_clean2.txt", 'w', encoding="utf-8") as f:
    for each in mr_mc4:
        clean_text, text_without_punct = clean_marathi_text2(each["text"])
        # An empty punctuation-free form means nothing but whitespace and
        # punctuation survived cleaning, so the document is skipped.
        if not text_without_punct:
            continue
        count += 1
        f.write(clean_text.strip() + "\n\n")

print(f"{split_type} clean docs count {count} out of {len(mr_mc4)}")