amankhandelia
/

panini

Inference Endpoints

Model card Files Files and versions Community

panini / utils.py

amank

Minor Code cleanup in utils file

7007f93 almost 3 years ago

history blame contribute delete

No virus

1.75 kB

	import regex as re
	import string

	def keep_devnagri(text: str) -> tuple:
	    """
	    Remove all non-Devanagari characters from the text.
	    Code adapted from https://huggingface.co/flax-community/roberta-base-mr/blob/64d2c745f264f09c3d5b678a718746b2613887db/mr_clean_text.py

	    Runs of Devanagari-script characters, ASCII digits, the danda,
	    whitespace, "." and "!" are kept and joined back together; every other
	    character is dropped. Runs of spaces are then collapsed to one space.

	    @param text: str Text to be cleaned
	    @return: tuple (cleaned, is_just_punctuation). cleaned is the filtered
	        text. is_just_punctuation is True when nothing is left of cleaned
	        after deleting ASCII punctuation and ASCII digits (this includes an
	        empty result). Whitespace and the danda are not deleted by that
	        check, so a result made only of those yields False.
	    """
	    # The script-property class needs the third-party `regex` module, which
	    # this file imports under the name `re`.
	    pattern = r'[\p{Devanagari}0-9।\s\.\!]+'

	    # Character class of ASCII punctuation symbols and digits. The trailing
	    # r"\|" is a raw string on purpose: "\|" in a normal literal is an
	    # invalid escape sequence and triggers a warning on recent Pythons.
	    punctuation_regex = re.compile("[" + re.escape(string.punctuation) + string.digits + r"\|" + "]")

	    # keep only the text which is in devanagari script
	    cleaned = "".join(re.findall(pattern, text))

	    # collapse any run of spaces between words into a single space
	    cleaned = re.sub(r"[ ]+", " ", cleaned)

	    # the text is "just punctuation" if stripping punctuation/digits empties it
	    is_just_punctuation = not punctuation_regex.sub("", cleaned)

	    return cleaned, is_just_punctuation

	def keep_devnagri_hf_doc(document):
	    """
	    Clean the 'text' field of a document with keep_devnagri.

	    The document is modified in place and also returned. 'text' may be a
	    single string or a list of strings (batched); a list is cleaned
	    element by element.

	    @param document: mapping with a 'text' entry holding a str or a list
	    @return: the same document object with 'text' replaced by cleaned text
	    @raise TypeError: if document['text'] is neither a str nor a list
	    """
	    def get_clean_text(text):
	        cleaned_text, is_just_punctuation = keep_devnagri(text)
	        # to handle the tokenizer as empty string may cause issues
	        # also this only happens for 5 out of 10000 docs, should not
	        # affect the results
	        return " " if is_just_punctuation else cleaned_text

	    content = document['text']
	    if isinstance(content, str):
	        document['text'] = get_clean_text(content)
	    elif isinstance(content, list):
	        document['text'] = [get_clean_text(item) for item in content]
	    else:
	        # The check is on the 'text' field, so the message names that field
	        # and the offending type rather than the document itself.
	        raise TypeError(
	            f"document['text'] must be a str or a list of str, got {type(content).__name__}"
	        )

	    return document