roberta-base-mr / mr_clean_text.py
import sys
import string
import regex as re
# import re
from datasets import load_dataset
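
# The dataset split to clean is passed on the command line,
# e.g. python mr_clean_text.py train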
split_type = sys.argv[1]
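

# Keep only characters in the Devanagari Unicode block (U+0900-U+097F) plus
# whitespace, then collapse runs of whitespace into single spaces.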
def clean_marathi_text(text):
clean_text = "".join([tok.group() for tok in re.finditer(r'[\u0900-\u097F\s]', text)])
clean_text = re.sub(r"\s+", " ", clean_text)
return clean_text
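

# Variant used below: keeps Devanagari text, whitespace, '.' and '!', and also
# returns a copy with all ASCII punctuation stripped so that documents which are
# empty after cleaning can be detected.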
def clean_marathi_text2(text):
    pattern1 = r'\P{Devanagari}+'
    pattern2 = r'[\p{Devanagari}\s\.\!]+'
    punct_rm = re.compile("[" + re.escape(string.punctuation) + "]")
    # cleaned = re.sub(pattern1, "", text)
    cleaned = "".join([tok.group() for tok in re.finditer(pattern2, text)])
    cleaned = re.sub(r"[ ]+", " ", cleaned)
    clean_punct_rm = punct_rm.sub("", cleaned)
    return cleaned, clean_punct_rm.strip()
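

# Load the Marathi portion of mC4 and write one cleaned document per block,
# with blank lines separating documents.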
mr_mc4 = load_dataset("mc4", "mr", split=split_type)
print(mr_mc4)
count = 0
with open(f"mr_{split_type}_clean2.txt", 'w', encoding="utf-8") as f:
    for ind, each in enumerate(mr_mc4):
        clean_text, only_punct = clean_marathi_text2(each["text"])
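        # Skip documents that are empty once punctuation is removed.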
        if only_punct:
            count += 1
            f.write(clean_text.strip())
            f.write("\n\n")
print(f"{split_type} clean docs count {count} out of {len(mr_mc4)}")