Spaces:

zArabi
/

Persian-Sentiment-Analysis

Runtime error

Persian-Sentiment-Analysis / preprocessing.py

Update preprocessing.py

3238f2f over 2 years ago

1.92 kB

	import hazm
	from cleantext import clean
	import regex as re

	def cleanhtml(raw_html):
	cleanr = re.compile('<.*?>')
	cleantext = re.sub(cleanr, '', raw_html)
	return cleantext

	def cleaning(text):
	text = text.strip()

	# regular cleaning
	# https://pypi.org/project/clean-text/ >> works well for eng and de languages
	text = clean(text,
	fix_unicode=True,
	to_ascii=False,
	lower=True,
	no_line_breaks=True,
	no_urls=True,
	no_emails=True,
	no_phone_numbers=True,
	no_numbers=False,
	no_digits=False,
	no_currency_symbols=True,
	no_punct=False, #Keep the punc
	replace_with_url="",
	replace_with_email="",
	replace_with_phone_number="",
	replace_with_number="",
	replace_with_digit="0",
	replace_with_currency_symbol="",
	)

	# cleaning htmls
	text = cleanhtml(text)

	# normalizing > https://github.com/sobhe/hazm
	normalizer = hazm.Normalizer()
	text = normalizer.normalize(text)

	# removing wierd patterns
	wierd_pattern = re.compile("["
	u"\U0001F600-\U0001F64F" # emoticons
	u"\U0001F300-\U0001F5FF" # symbols & pictographs
	u"\U0001F680-\U0001F6FF" # transport & map symbols
	u"\U0001F1E0-\U0001F1FF" # flags (iOS)
	u"\U00002702-\U000027B0"
	u"\U000024C2-\U0001F251"
	u"\U0001f926-\U0001f937"
	u'\U00010000-\U0010ffff'
	u"\u200d"
	u"\u2640-\u2642"
	u"\u2600-\u2B55"
	u"\u23cf"
	u"\u23e9"
	u"\u231a"
	u"\u3030"
	u"\ufe0f"
	u"\u2069"
	u"\u2066"
	# u"\u200c"
	u"\u2068"
	u"\u2067"
	"]+", flags=re.UNICODE)

	text = wierd_pattern.sub(r'', text)

	# removing extra spaces, hashtags
	text = re.sub("#", "", text)
	text = re.sub("\s+", " ", text)

	return text