import re
from typing import List

import nltk  # only used by the commented-out dictionary-based cleaning below
from transformers import pipeline
from tqdm import tqdm


def tweet_cleaner(tweet: str) -> str:
    """
    Cleans a tweet by removing URLs and collapsing extra whitespace.

    (The commented-out block below is an earlier, stricter version that also
    strips @ mentions and hashtags and keeps only dictionary words.)

    Args:
        tweet (str): A single tweet as a string.

    Returns:
        str: The cleaned tweet.
    """
    # words = set(nltk.corpus.words.words())
    # # Remove @ mentions from the tweet
    # tweet = re.sub("@[A-Za-z0-9]+", "", tweet)
    # # Remove URLs from the tweet
    # tweet = re.sub(r"(?:\@|http?\://|https?\://|www)\S+", "", tweet)
    # # Remove extra whitespaces from the tweet
    # tweet = " ".join(tweet.split())
    # # Remove hashtag sign but keep the text
    # tweet = tweet.replace("#", "").replace("_", " ")
    # # Tokenize the tweet and keep only valid words
    # tweet = " ".join(
    #     w
    #     for w in nltk.wordpunct_tokenize(tweet)
    #     if w.lower() in words or not w.isalpha()
    # )
    # # Return the cleaned tweet
    # return tweet
    bad_start = ["http:", "https:"]
    for w in bad_start:
        tweet = re.sub(f" {w}\\S+", "", tweet)  # removes whitespace before the URL
        tweet = re.sub(f"{w}\\S+ ", "", tweet)  # in case a tweet starts with a URL
        tweet = re.sub(f"\n{w}\\S+ ", "", tweet)  # in case the URL is on a new line
        tweet = re.sub(
            f"\n{w}\\S+", "", tweet
        )  # in case the URL is alone on a new line
        tweet = re.sub(f"{w}\\S+", "", tweet)  # any other case
    tweet = re.sub(" +", " ", tweet)  # replace multiple spaces with one space
    return " ".join(tweet.split()).strip()


def is_boring_tweet(tweet):
    """Check if a tweet is 'boring': fewer than three words that are not URLs, mentions, or hashtags."""
    boring_stuff = ["http", "@", "#"]
    not_boring_words = sum(
        1
        for word in tweet.split()
        if not any(bs in word.lower() for bs in boring_stuff)
    )
    return not_boring_words < 3


def fix_text(text):
    """Unescape the HTML entities commonly left in tweet text."""
    text = text.replace("&amp;", "&")
    text = text.replace("&lt;", "<")
    text = text.replace("&gt;", ">")
    return text


def get_tweets_sentiment(tweets: List[str]) -> List[float]:
    """
    Takes in a list of tweets and returns their sentiment scores as a list of floats between 0 and 1.

    Each score is the classifier's confidence in the predicted label
    (POSITIVE or NEGATIVE), not a signed polarity.

    Parameters:
        tweets (List[str]): A list of tweet texts, or of dicts with a "content" key.

    Returns:
        List[float]: A list of sentiment scores for the input tweets, where each
            score is a float between 0 and 1.
    """
    # Load the sentiment analysis pipeline
    classifier = pipeline(
        "sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english"
    )
    # Clean tweets, handling both raw strings and dicts with a "content" key
    if isinstance(tweets[0], dict):
        tweet_texts = [tweet_cleaner(t["content"]) for t in tqdm(tweets)]
    else:
        tweet_texts = [tweet_cleaner(t) for t in tqdm(tweets)]
    # Get tweet sentiment scores
    tweet_sentiments = classifier(tweet_texts)
    # Extract the sentiment score from each result and return as a list
    return [t["score"] for t in tweet_sentiments]
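

# Usage sketch: a minimal, hypothetical example of wiring the helpers together.
# It cleans a few raw tweets, unescapes HTML entities, drops the "boring" ones,
# and scores the rest. The sample_tweets list, the example.com URLs, and the
# __main__ guard are illustrative assumptions, not part of any real pipeline
# or dataset.
if __name__ == "__main__":
    sample_tweets = [
        "Loving the new release! https://example.com/changelog #update",
        "@someone http://example.com",  # mostly a mention + URL, likely "boring"
        "The &amp; sign and &lt;tags&gt; should be unescaped before scoring.",
    ]
    # Clean and unescape each tweet, then filter out low-content ones
    cleaned = [fix_text(tweet_cleaner(t)) for t in sample_tweets]
    kept = [t for t in cleaned if not is_boring_tweet(t)]
    # Score the remaining tweets (this downloads the DistilBERT model on first run)
    scores = get_tweets_sentiment(kept)
    for text, score in zip(kept, scores):
        print(f"{score:.3f}  {text}")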