aus10powell committed on
Commit
2a3e5e3
1 Parent(s): 33d6c4f

Update scripts/sentiment.py

Browse files
Files changed (1) hide show
  1. scripts/sentiment.py +84 -19
scripts/sentiment.py CHANGED
@@ -3,7 +3,9 @@ import nltk
3
  from typing import List
4
  from transformers import pipeline
5
  from tqdm import tqdm
6
-
 
 
7
 
8
  def tweet_cleaner(tweet: str) -> str:
9
  # words = set(nltk.corpus.words.words())
@@ -68,30 +70,93 @@ def fix_text(text):
68
  return text
69
 
70
 
71
def get_tweets_sentiment(tweets: List[str]) -> List[float]:
    """
    Clean a batch of tweets and score them with a DistilBERT SST-2 sentiment pipeline.

    Parameters:
        tweets (List[str]): A list of tweet texts. Items may also be dicts that
            carry the tweet text under the "content" key; both forms can be mixed.

    Returns:
        List[float]: The "score" field of each pipeline result, one per input
            tweet and in input order. An empty (or None) input yields an empty list.
            NOTE(review): for this checkpoint the score is presumably the
            confidence of the predicted label rather than a polarity value --
            confirm against the pipeline documentation before comparing scores
            across tweets.
    """
    # Nothing to score: return before paying for the model load, and avoid the
    # IndexError that probing tweets[0] would raise on an empty list.
    if not tweets:
        return []

    # Decide per item whether the text is wrapped in a dict, so a batch is not
    # misread just because of the type of its first element.
    tweet_texts = [
        tweet_cleaner(t["content"] if isinstance(t, dict) else t)
        for t in tqdm(tweets)
    ]

    # Load the sentiment analysis pipeline
    classifier = pipeline(
        "sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english"
    )

    # Score the whole batch in one call and keep only the numeric score.
    tweet_sentiments = classifier(tweet_texts)
    return [t["score"] for t in tweet_sentiments]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  from typing import List
4
  from transformers import pipeline
5
  from tqdm import tqdm
6
+ import numpy as np
7
+ import numpy as np
8
+ import scipy
9
 
10
  def tweet_cleaner(tweet: str) -> str:
11
  # words = set(nltk.corpus.words.words())
 
70
  return text
71
 
72
 
73
# Checkpoint used by both the hosted API path and the local path.
_SENTIMENT_MODEL = "cardiffnlp/twitter-roberta-base-sentiment"
_SENTIMENT_API_URL = f"https://api-inference.huggingface.co/models/{_SENTIMENT_MODEL}"
# Environment variable holding the Hugging Face API token; credentials must
# never be committed to source control.
_HF_TOKEN_ENV_VAR = "HF_API_TOKEN"
# Upper bound for a single API round trip so a stalled connection cannot hang the caller.
_API_TIMEOUT_SECONDS = 60
# Raw labels returned by the hosted API mapped to readable names.
_API_LABEL_NAMES = {
    "LABEL_0": "negative",
    "LABEL_1": "neutral",
    "LABEL_2": "positive",
}


def twitter_sentiment_api_score(
    tweet_list: list = None, return_argmax: bool = True, use_api=False
):
    """
    Score tweets with the cardiffnlp/twitter-roberta-base-sentiment model.

    Args:
        tweet_list (list): A list of strings, where each string represents a tweet.
            None or an empty list returns an empty list without loading anything.
        return_argmax (bool): Whether to also return the predicted sentiment label
            with the highest confidence score for each tweet.
        use_api (bool): When True, send the tweets to the hosted Hugging Face
            Inference API (requires the HF_API_TOKEN environment variable).
            When False, load the model locally through `transformers`.

    Returns:
        A list of dictionaries, one per tweet, mapping "positive", "neutral" and
        "negative" to a float confidence score. If `return_argmax` is True, each
        dictionary also contains the key "argmax" with the highest-scoring label.
        On the local path the scores are rounded to 4 decimals and the tweet is
        passed through `tweet_cleaner` first; on the API path the tweets are sent
        as given and the scores are returned as received.

    Raises:
        RuntimeError: On the API path, if HF_API_TOKEN is not set or the API
            answers with an error payload instead of a list of scores.
    """
    # Guard the default (None) and empty input up front; this also avoids
    # loading the model or touching the network for nothing.
    if not tweet_list:
        return []

    if use_api:
        return _score_tweets_with_api(tweet_list, return_argmax)
    return _score_tweets_locally(tweet_list, return_argmax)


def _score_tweets_with_api(tweet_list, return_argmax):
    """POST the tweets to the hosted Inference API and relabel the response."""
    import os

    import requests

    token = os.environ.get(_HF_TOKEN_ENV_VAR)
    if not token:
        raise RuntimeError(
            f"Set the {_HF_TOKEN_ENV_VAR} environment variable to a Hugging Face "
            "API token to use use_api=True."
        )

    response = requests.post(
        _SENTIMENT_API_URL,
        headers={"Authorization": f"Bearer {token}"},
        json={"inputs": tweet_list},
        timeout=_API_TIMEOUT_SECONDS,
    )
    output = response.json()

    # A successful call for a list of inputs is a list; a dict is an error
    # payload. Surface it instead of silently returning empty score dicts.
    if isinstance(output, dict):
        raise RuntimeError(
            f"Hugging Face Inference API request failed: {output.get('error', output)}"
        )
    response.raise_for_status()

    return _relabel_api_scores(output, return_argmax)


def _relabel_api_scores(output, return_argmax):
    """Replace the API's LABEL_n names with positive/neutral/negative per tweet."""
    result = []
    for tweet_scores in output:
        sentiment_dict = {}
        for entry in tweet_scores:
            if not isinstance(entry, dict):
                continue
            name = _API_LABEL_NAMES.get(entry["label"])
            if name is not None:
                sentiment_dict[name] = entry["score"]
        # Only add "argmax" when there is at least one score to rank.
        if return_argmax and sentiment_dict:
            sentiment_dict["argmax"] = max(sentiment_dict, key=sentiment_dict.get)
        result.append(sentiment_dict)
    return result


def _score_tweets_locally(tweet_list, return_argmax):
    """Run the model in-process through `transformers` and score each tweet."""
    from scipy.special import softmax
    from transformers import AutoModelForSequenceClassification
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained(_SENTIMENT_MODEL)
    model = AutoModelForSequenceClassification.from_pretrained(_SENTIMENT_MODEL)
    # Index i of the model output corresponds to labels[i].
    labels = ["negative", "neutral", "positive"]

    def _get_sentiment(text):
        encoded_input = tokenizer(tweet_cleaner(text), return_tensors="pt")
        logits = model(**encoded_input)[0][0].detach().numpy()
        scores = softmax(logits)
        # Insert labels from most to least likely so the dict reads as a ranking.
        results = {
            labels[idx]: np.round(float(scores[idx]), 4)
            for idx in np.argsort(scores)[::-1]
        }
        if return_argmax:
            results["argmax"] = max(results, key=results.get)
        return results

    return [_get_sentiment(t) for t in tweet_list]