panini / utils.py

amank

Made changes to the cleaning code, modified the number of warmup steps, and now get eval samples from the validation split

7839b8e about 3 years ago

1.96 kB

	import regex as re
	import string

	# NOTE: `re` in this file is the third-party `regex` module (see the imports
	# at the top of the file); the standard-library `re` does not understand the
	# `\p{Devanagari}` script property used below.

	# Runs of characters to keep: Devanagari-script characters, ASCII digits,
	# the danda, whitespace and the '.' / '!' sentence terminators.
	_DEVNAGRI_RUNS = re.compile(r'[\p{Devanagari}0-9।\s\.\!]+')

	# Characters that carry no textual content on their own: ASCII punctuation
	# (which already contains '|'), ASCII digits, the danda and whitespace.
	_NO_CONTENT_CHARS = re.compile(
	    "[" + re.escape(string.punctuation) + string.digits + r"।\s" + "]"
	)

	# Two or more consecutive plain spaces (other whitespace is left untouched).
	_SPACE_RUNS = re.compile(r"[ ]+")


	def keep_devnagri(text:str):
	    """
	    Remove all non Devnagri characters from the text.
	    Code adapted from https://huggingface.co/flax-community/roberta-base-mr/blob/64d2c745f264f09c3d5b678a718746b2613887db/mr_clean_text.py

	    Only Devanagari-script characters, ASCII digits, the danda, whitespace,
	    '.' and '!' survive; runs of plain spaces are then collapsed to one space.

	    @param text: str Text to be cleaned
	    @return: Tuple[str, bool] The cleaned text, and a flag that is True when
	        the cleaned text is empty or consists solely of punctuation, ASCII
	        digits, the danda and/or whitespace (i.e. nothing meaningful is left).
	    """
	    # keep only the text which is in devnagari script
	    cleaned = "".join(tok.group() for tok in _DEVNAGRI_RUNS.finditer(text))

	    # remove any extra space between words
	    cleaned = _SPACE_RUNS.sub(" ", cleaned)

	    # Identify whether anything meaningful is left. Whitespace and the danda
	    # are stripped as well, otherwise a result such as ". !" or "\n" would
	    # not be reported as punctuation-only.
	    is_just_punctuation = not _NO_CONTENT_CHARS.sub("", cleaned)

	    return cleaned, is_just_punctuation

	def keep_devnagri_hf_doc(document):
	    """
	    Clean the 'text' field of a document with `keep_devnagri`.

	    The 'text' entry may be a single string or a list of strings (a batch);
	    it is replaced in place by its cleaned counterpart and the same
	    `document` object is returned.
	    NOTE(review): the name suggests use with Hugging Face `datasets.map`
	    in both batched and non-batched mode -- confirm against the caller.

	    @param document: dict-like object with a 'text' entry (str or list of str)
	    @return: the same `document`, with `document['text']` cleaned
	    @raise TypeError: if `document['text']` is neither a str nor a list
	    """
	    def get_clean_text(text):
	        cleaned_text, is_just_punctuation = keep_devnagri(text)
	        # A single space is substituted for punctuation-only / empty results
	        # because an empty string may cause issues in the tokenizer. Per the
	        # original author's note this affected about 5 out of 10000 docs,
	        # so it should not affect the results.
	        return " " if is_just_punctuation else cleaned_text

	    text = document['text']
	    if isinstance(text, str):
	        document['text'] = get_clean_text(text)
	    elif isinstance(text, list):
	        document['text'] = [get_clean_text(item) for item in text]
	    else:
	        # The check is on the 'text' field, not on the document itself, so
	        # the message names the field and the offending type.
	        raise TypeError(
	            "document['text'] must be a str or a list of str, "
	            f"got {type(text).__name__}."
	        )

	    return document