Spaces:

tejacherukuri
/

ProSpectAI

Running

ProSpectAI / src /utils.py

TejaCherukuri

prospectai code initial version

3d74a95 5 months ago

1.71 kB

	import re

	def clean_text(text: str) -> str:
	"""
	Cleans and preprocesses the input text by removing unwanted elements such as HTML tags, URLs,
	special characters, and extra whitespace. This function is useful for preparing text data for
	further processing or analysis.

	Parameters:
	-----------
	text : str
	The input text to be cleaned. This text may contain HTML tags, URLs, special characters,
	multiple spaces, and unnecessary whitespace.

	Returns:
	--------
	str
	A cleaned version of the input text with the following modifications:
	- HTML tags removed
	- URLs removed
	- Special characters (other than letters, digits, and spaces) removed
	- Multiple consecutive spaces replaced with a single space
	- Leading and trailing whitespace removed
	- Extra spaces between words reduced to a single space

	Example:
	--------
	>>> clean_text("<p>Hello <b>World</b>! Visit http://example.com for more info.</p>")
	'Hello World Visit for more info'
	"""

	# Remove HTML tags
	text = re.sub(r'<[^>]*?>', '', text)

	# Remove URLs
	text = re.sub(r'http[s]?://(?:[a-zA-Z]\|[0-9]\|[$-_@.&+]\|[!*\$\$,]\|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', text)

	# Remove special characters (anything that is not a letter, number, or space)
	text = re.sub(r'[^a-zA-Z0-9 ]', '', text)

	# Replace multiple spaces with a single space
	text = re.sub(r'\s{2,}', ' ', text)

	# Trim leading and trailing whitespace
	text = text.strip()

	# Remove extra whitespace between words (in case of multiple spaces)
	text = ' '.join(text.split())

	return text