gpt2-medium-persian / src /data_utils.py
saied's picture
pushing tokenizer
c36ebf7
raw
history blame
1.22 kB
from hazm import word_tokenize
from hazm import sent_tokenize
import re
import six
from normalizer import normalize
persian_regex = "0-9۰۱۲۳۴۵۶۷۸۹ءآئابتثجحخدذرزسشصضطظعغفقلمنهوپچژکگیە\u200c"
def filter_by_lang_regex(text, ratio=0.7, regex="0-9۰۱۲۳۴۵۶۷۸۹ءآئابتثجحخدذرزسشصضطظعغفقلمنهوپچژکگیە\u200c"):
    """Return True when more than `ratio` of the non-space characters of
    `text` belong to the Persian character class given by `regex`.

    Args:
        text: input document, `str` or UTF-8 `bytes`.
        ratio: minimum fraction (exclusive) of in-class characters required.
        regex: character class (regex set body) defining "Persian" characters;
            includes ASCII and Persian digits, Persian letters and ZWNJ.

    Returns:
        bool — True if the text is sufficiently Persian; False for empty /
        whitespace-only input (the original raised ZeroDivisionError there).
    """
    # Normalize to str once so both the regex pass and the plain-space
    # strip below operate on the same type (the old code mixed six.ensure_str
    # output with the raw value, crashing on bytes input).
    if isinstance(text, bytes):
        text = text.decode("utf-8")
    # Replace every run of out-of-class characters with a space, then drop
    # all spaces so only in-class characters remain.
    candidate_text = re.sub(r"[^" + regex + "]+", " ", text).replace(" ", "")
    text = text.replace(" ", "")
    if not text:
        # Nothing to measure — treat as "not Persian" instead of dividing by zero.
        return False
    return (len(candidate_text) / len(text)) > ratio
def filter_by_num_tokens(text, gt=64):
    """Return True when `text` contains strictly more than `gt` word tokens,
    as counted by hazm's `word_tokenize`."""
    token_count = len(word_tokenize(text))
    return token_count > gt
def filter_by_num_sents(text, gt=2):
    """Return True when `text` contains strictly more than `gt` sentences,
    as counted by hazm's `sent_tokenize`."""
    sentence_count = len(sent_tokenize(text))
    return sentence_count > gt
def filter_by_adv(text, ratio=50):
    """Heuristic spam/advertisement filter.

    Counts comma-separated parts, `key:value` word pairs, and Persian-comma
    («،») separated parts; returns True (keep) when their sum is strictly
    below `ratio`.  Note the split counts are "number of parts", i.e.
    delimiter count + 1 for each splitter.
    """
    n_comma_parts = len(text.split(","))
    n_colon_pairs = len(re.findall(r"""(?:([^\W]+):([^\W]+))""", text))
    n_virgool_parts = len(text.split("،"))
    return (n_comma_parts + n_colon_pairs + n_virgool_parts) < ratio
# def normalizer(text, do_lowercase=False):
# text = normalize(text)
# if do_lowercase:
# text = text.lower()
# return text
def normalizer(example):
    """Map-style normalizer: replace the "text" field of `example` (a dict)
    with its normalized form and return the mutated example."""
    raw_text = example["text"]
    example["text"] = normalize(raw_text)
    return example