# TwitterAccounts/scripts/sentiment.py
import re
import nltk
from typing import List
from transformers import pipeline
from tqdm import tqdm
import numpy as np
def tweet_cleaner(tweet: str) -> str:
    """
    Cleans a tweet by removing URLs and collapsing repeated whitespace.

    (The commented-out nltk word list below could additionally be used to drop
    non-dictionary words, but that filtering is currently disabled.)

    Args:
        tweet (str): A single tweet as a string.

    Returns:
        str: The cleaned tweet.
    """
    # words = set(nltk.corpus.words.words())
    if not isinstance(tweet, str):
        try:
            tweet = str(tweet)
        except Exception as e:
            # Fall back to an empty string so downstream regex calls still receive a str.
            print(f"Error converting tweet to string: {e}")
            return ""
bad_start = ["http:", "https:"]
for w in bad_start:
tweet = re.sub(f" {w}\\S+", "", tweet) # remove white space before url
tweet = re.sub(f"{w}\\S+ ", "", tweet) # in case a tweet starts with a url
tweet = re.sub(f"\n{w}\\S+ ", "", tweet) # in case the url is on a new line
tweet = re.sub(
f"\n{w}\\S+", "", tweet
) # in case the url is alone on a new line
tweet = re.sub(f"{w}\\S+", "", tweet) # any other case?
tweet = re.sub(" +", " ", tweet) # replace multiple spaces with one space
return " ".join(tweet.split()).strip()
def is_boring_tweet(tweet):
    """Check if a tweet is boring, i.e. has fewer than three words that are not URLs, mentions, or hashtags."""
    boring_stuff = ["http", "@", "#"]
not_boring_words = sum(
1
for word in tweet.split()
if not any(bs in word.lower() for bs in boring_stuff)
)
return not_boring_words < 3
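
# Example (illustrative): "@user check https://t.co/x #wow" keeps only one plain word
# ("check") after the filter, so is_boring_tweet returns True, while any tweet with three
# or more plain words returns False.
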
def fix_text(text):
    """Unescape the HTML entities most commonly left in tweet text."""
    text = text.replace("&amp;", "&")
    text = text.replace("&lt;", "<")
    text = text.replace("&gt;", ">")
    return text
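
# The three replacements above cover the entities that show up most often in tweet text; the
# standard library's html.unescape would handle the full entity set if that ever matters, e.g.
#   import html; html.unescape("Tom &amp; Jerry &gt; everything") -> "Tom & Jerry > everything"
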
def twitter_sentiment_api_score(
tweet_list: list = None, return_argmax: bool = True, use_api=False
):
"""
Sends a list of tweets to the Hugging Face Twitter Sentiment Analysis API and returns a list of sentiment scores for each tweet.
Args:
tweet_list (list): A list of strings, where each string represents a tweet.
return_argmax (bool): Whether to also return the predicted sentiment label with the highest confidence score for each tweet.
Returns:
A list of dictionaries, where each dictionary contains the sentiment scores for a single tweet. Each sentiment score dictionary
contains three key-value pairs: "positive", "neutral", and "negative". The value for each key is a float between 0 and 1 that
represents the confidence score for that sentiment label, where higher values indicate higher confidence in that sentiment. If
`return_argmax` is True, each dictionary will also contain an additional key "argmax" with the predicted sentiment label for
that tweet.
"""
if use_api:
        import os
        import requests

        # URL and authentication header for the Hugging Face Inference API.
        API_URL = "https://api-inference.huggingface.co/models/cardiffnlp/twitter-roberta-base-sentiment"
        # Read the access token from an environment variable (HF_API_TOKEN is an assumed name)
        # rather than hard-coding a credential in the source.
        headers = {"Authorization": f"Bearer {os.environ.get('HF_API_TOKEN', '')}"}
# Function to send a POST request with a JSON payload to the API and return the response as a JSON object
def query(payload):
response = requests.post(API_URL, headers=headers, json=payload)
return response.json()
# Send a list of tweets to the API and receive a list of sentiment scores for each tweet
output = query(
{
"inputs": tweet_list,
}
)
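        # Note: while the hosted model is cold-starting, the API may return an error payload
        # instead of per-tweet scores, so callers may want to retry after a short wait.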
else:
        from scipy.special import softmax
        from transformers import AutoModelForSequenceClassification, AutoTokenizer
task = "sentiment"
MODEL = f"cardiffnlp/twitter-roberta-base-{task}"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
# model.save_pretrained(MODEL)
        def get_sentiment(text):
labels = ["negative", "neutral", "positive"]
# text = "Good night 😊"
text = tweet_cleaner(text)
encoded_input = tokenizer(text, return_tensors="pt")
output = model(**encoded_input)
scores = output[0][0].detach().numpy()
scores = softmax(scores)
ranking = np.argsort(scores)[::-1]
results = {
labels[ranking[i]]: np.round(float(scores[ranking[i]]), 4)
for i in range(scores.shape[0])
}
max_key = max(results, key=results.get)
results["argmax"] = max_key
return results
        return [get_sentiment(t) for t in tweet_list]
# Loop through the list of sentiment scores and replace the sentiment labels with more intuitive labels
result = []
for s in output:
sentiment_dict = {}
for d in s:
if isinstance(d, dict):
if d["label"] == "LABEL_2":
sentiment_dict["positive"] = d["score"]
elif d["label"] == "LABEL_1":
sentiment_dict["neutral"] = d["score"]
elif d["label"] == "LABEL_0":
sentiment_dict["negative"] = d["score"]
if return_argmax and len(sentiment_dict) > 0:
argmax_label = max(sentiment_dict, key=sentiment_dict.get)
sentiment_dict["argmax"] = argmax_label
result.append(sentiment_dict)
# Return a list of dictionaries, where each dictionary contains the sentiment scores for a single tweet
return result
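

if __name__ == "__main__":
    # Minimal smoke test, as a sketch only: score a few hand-written tweets with the locally
    # loaded model (use_api=False). Assumes the model weights can be downloaded from the
    # Hugging Face Hub on first run; the sample tweets are invented for illustration.
    sample_tweets = [
        "Absolutely loving the new update, great work!",
        "This is the worst customer service I have ever had.",
        "Flight delayed again, still sitting at the gate.",
    ]
    for tweet, scores in zip(sample_tweets, twitter_sentiment_api_score(sample_tweets, use_api=False)):
        print(f"{scores['argmax']:>8}  {tweet}")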