import re

# Patterns are compiled once at import time so repeated calls to clean_text
# do not pay for a pattern-cache lookup on every substitution.
_HTML_TAG_RE = re.compile(r'<[^>]*>')
# Consume the whole non-whitespace run after the scheme so that fragments
# ("#top"), tildes ("/~user") and similar characters do not leave URL
# remnants behind in the cleaned text.
_URL_RE = re.compile(r'https?://[^\s<>"]+')
# Whitespace is deliberately kept here: newlines and tabs must survive until
# the final collapse step so they act as word separators.
_NON_ALNUM_RE = re.compile(r'[^a-zA-Z0-9\s]')


def clean_text(text: str) -> str:
    """
    Clean the input text by removing HTML tags, URLs and special characters,
    then normalizing whitespace.

    Parameters:
    -----------
    text : str
        The input text to be cleaned. It may contain HTML tags, URLs,
        special characters, newlines, tabs, and repeated whitespace.

    Returns:
    --------
    str
        A cleaned version of the input text with the following modifications:
        - HTML tags removed (no separator is inserted, so text on either
          side of a tag is joined directly)
        - URLs starting with ``http://`` or ``https://`` removed
        - Every character other than ASCII letters, ASCII digits, and
          whitespace removed (non-ASCII letters are removed as well)
        - Every run of whitespace (spaces, tabs, newlines) replaced with a
          single space
        - Leading and trailing whitespace removed

    Example:
    --------
    >>> clean_text("<p>Hello World! Visit http://example.com for more info.</p>")
    'Hello World Visit for more info'
    """
    # Order matters: tags go first so attribute values such as href URLs
    # disappear with their tag, and URLs go before punctuation stripping so
    # the "://" and "." that identify them are still present.
    text = _HTML_TAG_RE.sub('', text)
    text = _URL_RE.sub('', text)
    text = _NON_ALNUM_RE.sub('', text)

    # str.split() with no argument splits on any whitespace run and drops
    # leading/trailing whitespace, so this single step both collapses and
    # trims the text.
    return ' '.join(text.split())