TwitterAccounts / scripts /sentiment.py
aus10powell's picture
Upload 74 files
8158335
raw
history blame
3 kB
import re
import nltk
from typing import List
from transformers import pipeline
from tqdm import tqdm
def tweet_cleaner(tweet: str) -> str:
    """
    Cleans a tweet by removing URLs and normalising whitespace.

    Only tokens starting with ``http:`` or ``https:`` are removed; @ mentions
    and hashtags are left untouched. All runs of whitespace (including
    newlines) are collapsed to a single space and the result is stripped.

    Args:
        tweet (str): A single tweet as a string.
    Returns:
        str: The cleaned tweet.
    """
    # Replace each URL with a space rather than deleting it together with a
    # neighbouring space: this keeps the words on either side of a URL that is
    # glued to the preceding token (e.g. "see:http://x now") from being fused.
    without_urls = re.sub(r"https?:\S+", " ", tweet)
    # split() with no arguments drops leading/trailing whitespace and treats
    # any whitespace run (spaces, tabs, newlines) as one separator.
    return " ".join(without_urls.split())
def is_boring_tweet(tweet):
    """Tell whether a tweet has fewer than three "interesting" words.

    The tweet is split on whitespace. A word is considered uninteresting when
    its lowercased form contains "http", "@" or "#" anywhere inside it.
    Returns True when fewer than three words remain after discarding those.
    """
    markers = ["http", "@", "#"]
    interesting = [
        token
        for token in tweet.split()
        if all(marker not in token.lower() for marker in markers)
    ]
    return len(interesting) < 3
def fix_text(text: str) -> str:
    """
    Decodes the HTML entities ``&lt;``, ``&gt;`` and ``&amp;`` in a string.

    Args:
        text (str): Text that may contain the escaped entities.
    Returns:
        str: The text with each entity replaced by its literal character.
    """
    text = text.replace("&lt;", "<")
    text = text.replace("&gt;", ">")
    # "&amp;" must be decoded last: decoding it first would turn an escaped
    # entity such as "&amp;lt;" into "&lt;" and then wrongly into "<".
    text = text.replace("&amp;", "&")
    return text
def get_tweets_sentiment(tweets: List[str]) -> List[float]:
    """
    Runs a sentiment-analysis classifier over tweets and returns one score per tweet.

    Each tweet is cleaned with ``tweet_cleaner`` before classification. Items
    may be plain strings or dicts, in which case the text is read from the
    ``"content"`` key. The classification pipeline is constructed on every
    call.

    Parameters:
        tweets (List[str]): A list of tweet texts (or dicts with a "content" key).
    Returns:
        List[float]: The ``score`` field of the classifier result for each
            tweet, in input order. An empty input yields an empty list.

    NOTE(review): the returned value is the classifier's ``score`` only; the
    predicted ``label`` is discarded. The score is presumably the confidence
    of whichever label was predicted rather than a negative-to-positive
    scale - confirm against the transformers pipeline documentation.
    """
    # Nothing to classify: return before paying for the model load (and
    # before indexing into an empty list).
    if not tweets:
        return []

    # Load the sentiment analysis pipeline
    classifier = pipeline(
        "sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english"
    )

    # Clean tweets, pulling the text out of dict items one by one so the
    # decision does not hinge on the type of the first element only.
    tweet_texts = [
        tweet_cleaner(t["content"] if isinstance(t, dict) else t)
        for t in tqdm(tweets)
    ]

    # Get tweet sentiment score
    tweet_sentiments = classifier(tweet_texts)

    # Extract the score from each result and return as a list
    return [t["score"] for t in tweet_sentiments]