Spaces:
Configuration error
Configuration error
| import re | |
| import string | |
| import nltk | |
| from nltk.corpus import stopwords | |
| from nltk.tokenize import TweetTokenizer | |
| nltk.download("stopwords") | |
_STOP_WORDS = None  # filled on first use by _english_stop_words()


def _english_stop_words():
    """Return the NLTK English stopword set, building it only once."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words("english"))
    return _STOP_WORDS


def clean_tweet(tweet: str) -> str:
    """
    Normalize a tweet for downstream text processing.

    Steps, in order: lowercase the text, drop stock tickers such as ``$GE``,
    drop a leading old-style retweet marker (``RT``), drop hyperlinks,
    strip ASCII punctuation (which also removes the ``#`` of hashtags while
    keeping the tag word), and remove English stopwords.

    Characters outside ``string.punctuation`` (emojis, typographic quotes,
    etc.) are left untouched.

    :param tweet: raw tweet text
    :return: lowercase, space-separated words with tickers, the retweet
        marker, links, ASCII punctuation and English stopwords removed
    """
    text = tweet.lower()

    # remove stock market tickers like $GE
    text = re.sub(r'\$\w*', '', text)

    # remove the old style retweet marker; the text is already lowercase at
    # this point, so the pattern has to look for "rt", not "RT"
    text = re.sub(r'^rt\s+', '', text)

    # remove hyperlinks: only the URL itself, not the rest of the line
    text = re.sub(r'https?://\S+', '', text)

    # remove punctuation; "#" is part of string.punctuation, so hashtags
    # lose their hash sign here and keep the word
    text = text.translate(str.maketrans('', '', string.punctuation))

    words = text.split()
    if not words:
        # nothing left to filter, so skip the stopword lookup entirely
        return ""

    # remove stopwords
    stop_words = _english_stop_words()
    return " ".join(word for word in words if word not in stop_words)