roberta-base-mr / mr_clean_text.py
import sys
import string
import regex as re
# import re
from datasets import load_dataset
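
# The dataset split to clean is passed on the command line,
# e.g. python mr_clean_text.py train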
split_type = sys.argv[1]
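

# Keep only characters in the Devanagari Unicode block (U+0900-U+097F) plus
# whitespace, then collapse runs of whitespace into single spaces.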
def clean_marathi_text(text):
clean_text = "".join([tok.group() for tok in re.finditer(r'[\u0900-\u097F\s]', text)])
clean_text = re.sub(r"\s+", " ", clean_text)
return clean_text
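

# Variant used below: keeps Devanagari text, whitespace, '.' and '!', and also
# returns a copy with all ASCII punctuation stripped so that documents which are
# empty after cleaning can be detected.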
def clean_marathi_text2(text):
    pattern1 = r'\P{Devanagari}+'
    pattern2 = r'[\p{Devanagari}\s\.\!]+'
    punct_rm = re.compile("[" + re.escape(string.punctuation) + "]")
    # cleaned = re.sub(pattern1, "", text)
    cleaned = "".join([tok.group() for tok in re.finditer(pattern2, text)])
    cleaned = re.sub(r"[ ]+", " ", cleaned)
    clean_punct_rm = punct_rm.sub("", cleaned)
    return cleaned, clean_punct_rm.strip()
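

# Load the Marathi portion of mC4 and write one cleaned document per block,
# with blank lines separating documents.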
mr_mc4 = load_dataset("mc4", "mr", split=split_type)
print(mr_mc4)
count = 0
with open(f"mr_{split_type}_clean2.txt", 'w', encoding="utf-8") as f:
    for ind, each in enumerate(mr_mc4):
        clean_text, only_punct = clean_marathi_text2(each["text"])
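        # Skip documents that are empty once punctuation is removed.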
        if only_punct:
            count += 1
            f.write(clean_text.strip())
            f.write("\n\n")
print(f"{split_type} clean docs count {count} out of {len(mr_mc4)}")