MarcusAscard committed: Final push

textclassifier/TextClassifier.py (+606 -217)
CHANGED
@@ -1,217 +1,606 @@
(The previous 217-line version of TextClassifier.py was removed in this commit and replaced in full by the 606-line version below.)
import os
import time
import warnings
import openai
import pandas as pd
from dotenv import find_dotenv, load_dotenv
from pandas.core.common import SettingWithCopyWarning
from twitterscraper import TwitterScraper
from sentence_transformers import SentenceTransformer
from scipy import spatial
from datetime import date, timedelta

warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)

# Set one directory up into ROOT_PATH
ROOT_PATH = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

dotenv_path = find_dotenv()
load_dotenv(dotenv_path)
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")


class TextClassifier:
    def __init__(self, model_name="text-davinci-002", from_date='2022-01-01', to_date=str(date.today()),
                 user_list=['jimmieakesson'],
                 num_tweets=20):
        """
        Initializes the TextClassifier.
        :param model_name: name of the model from openai.
        :param from_date: string of the format 'YYYY-MM-DD'.
        :param to_date: string of the format 'YYYY-MM-DD'.
        :param user_list: list of Twitter usernames whose tweets should be scraped.
        :param num_tweets: integer value of the maximum number of tweets to be scraped.
        """
        # Make sure user_list is not empty
        assert user_list is not None, "user_list cannot be empty"

        self.ts = TwitterScraper.TwitterScraper(from_date, to_date, num_tweets)
        self.model_name = model_name
        self.from_date = from_date
        self.to_date = to_date
        self.num_tweets = num_tweets
        self.user_name = user_list
        # Assure that scrape_by_user actually gets num_tweets
        # add timer in time-loop and stop after 10 seconds
        # self.df = self.ts.scrape_by_user(user_name)
        self.df = self.ts.scrape_by_several_users(user_list)
        # Make id as type int64
        self.df.loc[:, 'id'] = self.df.id.copy().apply(lambda x: int(x))
        openai.api_key = OPENAI_API_KEY

    def classify_all(self, tweet: str):
        """
        Classifies the topic, subtopic, sentiment and target of a single tweet.
        """
        valid_tweet = len(tweet.split()) > 4
        if valid_tweet:
            openai.api_key = os.getenv("OPENAI_API_KEY")
            promptstring = "Decide a Tweet's political TOPIC and SUBTOPIC, without classifying it as 'politics'. Also " \
                           "decide whether a political Tweet's " \
                           "SENTIMENT is " \
                           "positive, " \
                           "negative or neutral. Also give the TARGET of the sentiment. \nGive the answer in the form ' (" \
                           "TOPIC, SUBTOPIC, SENTIMENT, TARGET)'\n\nTweet: {} \nAnswer: ".format(tweet)
            response = openai.Completion.create(
                model="text-davinci-002",
                prompt=promptstring,
                temperature=0,
                max_tokens=30,
                top_p=1,
                frequency_penalty=0.5,
                presence_penalty=0
            )
            classification_unclean = response.choices[0]['text']
            classification_clean = self.cleanup_topic_results(classification_unclean)
            if classification_clean.lower() == "(topic, subtopic, sentiment, target)":
                # The model echoed the template back instead of classifying
                classification_clean = "(none, none, none, none)"
        else:
            # Tweets of four words or fewer are not classified
            classification_clean = "(none, none, none, none)"
        return classification_clean.lower()

    def classify_all_list(self):
        """
        Classifies the topics of a user's tweets.
        """
        df_topic = self.df.copy()
        df_topic['class_tuple'] = df_topic['tweet'].apply(self.classify_all)
        self.df = df_topic
        self.split_tuple_into_columns()
        return self.df

    @staticmethod
    def cleanup_topic_results(text):
        """
        Cleans up a response from GPT-3 into a string matching the format: "(main_topic, sub_topic, sentiment, target)"
        :param text: GPT-3 response
        :return: A string on the format: "(main_topic, sub_topic, sentiment, target)"
        """
        new_item = text.strip()
        new_item = new_item.replace("\n", "")
        new_item = new_item.replace(" ", "")
        item_control = new_item.replace("(", "")
        item_control = item_control.replace(")", "")
        item_control = item_control.split(",")
        if ' ' in item_control or '' in item_control:
            item_control = [s.strip() if not (s == ' ' or s == '') else 'none' for s in
                            item_control]  # Replace empty classifications with 'none'
        diff = 4 - len(item_control)
        if diff < 0:  # If the response gave more than four predictions
            cutout = item_control[diff - 1:]  # Cut out the superfluous predictions
            item_control = item_control[:diff - 1]  # Save the rest
            new_s = ""
            for i in range(len(cutout)):
                new_s += cutout[i]
                if i < -diff:
                    new_s += " and "  # Merge superfluous predictions. E.g. target = 's', 'mp', 'v' -> target = 's and mp and v'
            item_control.append(new_s)
        elif diff > 0:  # If the response gave fewer than four predictions
            for i in range(diff):
                item_control.append("none")  # Fill out the tuple with nones
        new_item = str(tuple(item_control))
        new_item = new_item.replace("'", "")
        return new_item

    def df_to_csv(self, filename="{}/data/twitterdata.csv".format(ROOT_PATH)):
        """
        Writes the pandas df to a csv file. If the file already exists, it appends; if not, it creates it.
        It also removes duplicates.
        :param filename: filename of the csv file
        :return: None
        """
        if not os.path.exists(filename):
            self.df.to_csv(filename, index=False)
        else:
            self.df.to_csv(filename, mode='a', header=False, index=False)

        self.remove_duplicates_from_csv(filename)

    @staticmethod
    def remove_duplicates_from_csv(filename="{}/data/twitterdata.csv".format(ROOT_PATH)):
        """
        Removes duplicate lines from the csv file.
        :param filename: filename of the csv file
        :return: None
        """
        with open(filename, 'r', encoding="utf8") as f:
            lines = f.readlines()
        with open(filename, 'w', encoding="utf8") as f:
            for line in lines:
                if line not in lines[lines.index(line) + 1:]:
                    f.write(line)

    def remove_already_classified_tweets(self, filename="{}/data/twitterdata.csv".format(ROOT_PATH)):
        """
        Removes tweets that have already been classified.
        :param filename: filename of the csv file
        :return: None
        """
        df = self.df
        df = df[df['sentiment'].isnull()]
        self.df = df
        self.df_to_csv(filename)

    def split_tuple_into_columns(self):
        """
        Splits the classification tuple (topic, subtopic, sentiment, target) into separate columns.
        :return: None
        """
        df_topic = self.df.copy()
        df_topic['topics_temp'] = df_topic['class_tuple'].apply(lambda x: tuple(x[1:-1].split(",")))
        df_topic_split = pd.DataFrame(df_topic['topics_temp'].tolist(),
                                      columns=['main_topic', 'sub_topic', 'sentiment', 'target'])
        # Manually add columns to self.df
        self.df['main_topic'] = df_topic_split['main_topic'].tolist()
        self.df['main_topic'] = self.df['main_topic'].replace(["n/a", "", " "], "none", regex=True)
        self.df['main_topic'] = self.df['main_topic'].apply(lambda x: x.strip() if not (len(x) == 1 and x == "-") else "none")

        self.df['sub_topic'] = df_topic_split['sub_topic'].tolist()
        # In a few of the outputs from GPT-3 the sub_topic = "sentiment"
        self.df['sub_topic'] = self.df['sub_topic'].replace(["n/a", "sentiment", "", " "], "none", regex=True)
        self.df['sub_topic'] = self.df['sub_topic'].apply(lambda x: x.strip() if not (len(x) == 1 and x == "-") else "none")

        self.df['sentiment'] = df_topic_split['sentiment'].tolist()
        self.df['sentiment'] = self.df['sentiment'].replace(["n/a", "sentiment", "", " "], "none", regex=True)
        self.df['sentiment'] = self.df['sentiment'].apply(lambda x: x.strip() if not (len(x) == 1 and x == "-") else "none")

        self.df['target'] = df_topic_split['target'].tolist()
        self.df['target'] = self.df['target'].replace(["n/a", "", " "], "none", regex=True)
        self.df['target'] = self.df['target'].apply(lambda x: x.strip() if not (len(x) == 1 and x == "-") else "none")

        self.df.fillna('none', inplace=True)

    def get_dataframe(self):
        """
        Returns the dataframe.
        :return: dataframe
        """
        return self.df

    def __repr__(self):
        """
        Gives a string that describes which users are classified.
        :return: a descriptive string
        """
        return "Classifier for users: " + ", ".join(self.user_name) + " with model: " + self.model_name + "."

    def get_database(self, filename="{}/data/twitterdata.csv".format(ROOT_PATH)):
        """
        Returns the database containing all classified tweets.
        :param filename: filename of the csv file
        :return: a dataframe read from the csv file
        """
        db = pd.read_csv(filename)
        return db

    def cleanup_list(self, uncleaned_list):
        """
        Cleans up faulty predictions.
        :param uncleaned_list: the list to be cleaned
        :return: cleaned list
        """
        uncleaned_list = [s if not isinstance(s, float) else "none" for s in uncleaned_list]
        uncleaned_list = [s if not len(s.split()) > 5 else "none" for s in uncleaned_list]
        uncleaned_list = [s if "swedish" not in s else s.replace("swedish", " ") for s in uncleaned_list]
        uncleaned_list = [s if "politics" not in s else s.replace("politics", "none") for s in uncleaned_list]
        uncleaned_list = [s.replace("  ", " ") for s in uncleaned_list]  # Collapse double spaces
        cleaned_list = [s.strip() for s in uncleaned_list]
        return cleaned_list

    def merge_lists(self, main_topic_list, sub_topic_list):
        """
        Merges the topic lists. If either topic is a faulty classification, only the non-faulty topic will be used.
        If both are faulty, the merged topic will be labeled as faulty (ERROR_496).
        :param main_topic_list: A list containing main topics
        :param sub_topic_list: A list containing sub topics
        :return: A list containing string items on the form "main_topic and sub_topic"
        """
        new_list = []
        main_topic_list = self.clean_party_names(main_topic_list)
        sub_topic_list = self.clean_party_names(sub_topic_list)
        for i in range(len(main_topic_list)):
            if main_topic_list[i].lower() == "none" and sub_topic_list[i].lower() == "none":  # If both predictions are faulty
                new_list.append("ERROR_496")  # Label as ERROR_496 (faulty prediction)
            elif main_topic_list[i].lower() == "none":
                new_list.append(sub_topic_list[i])
            elif sub_topic_list[i].lower() == "none":
                new_list.append(main_topic_list[i])
            else:
                new_list.append(main_topic_list[i] + " and " + sub_topic_list[i])
        return new_list

    def file_to_mat(self, classification_type):
        """
        Converts a synonym text file to a matrix in which each row contains a general topic/target and its related words.
        :param classification_type: The type of classification: topic or target
        :return: a matrix in which the first element of each row is a general topic/target, and the rest are words
                 related to the topic
        """
        filename = "{}/data/".format(ROOT_PATH)
        filename += classification_type + "_synonyms.txt"
        with open(filename, encoding='utf-8') as f:
            lines = f.read()
            lines = lines.split("\n")

        topic_list = []
        temp_list = []

        for topic in lines:
            if not topic.endswith("####"):
                temp_list.append(topic)
            else:
                temp_list.append(topic[:-4])  # Remove the marker (####)
                topic_list.append(temp_list)
                temp_list = []

        return topic_list

    def mat_to_list(self, mat):
        """
        Converts a matrix from file_to_mat() into one list containing all topics and synonyms, and one list with
        mappings for the synonyms.
        :param mat: a matrix from file_to_mat()
        :return: a tuple (full_list, mapped_synonyms)
        """
        full_list = []
        mapped_synonyms = []
        for syns in mat:
            for topic in syns:
                full_list.append(topic)
                mapped_synonyms.append(syns[0])
        return full_list, mapped_synonyms

    def clean_party_names(self, old_topic_list):
        """
        Encodes all party names to sentences that will yield a high cosine similarity value when merged with another
        topic, without taking the actual party name into account. These sentences have deliberately been composed such
        that they pose a low risk of being close (in the sentence embedding-space) to any possible merged topic or
        target that may be encountered.
        :param old_topic_list: list of topics
        :return: list of encoded topics
        """
        # Problem 1: When a party name is encountered, we want to bias the merging towards that party since the
        # occurrence of a very general main topic (as in the example below) plus a party name as subtopic is frequent.
        # Example: main_topic = "politics", sub_topic = "sweden democrats" ->
        # combined_topics = "politics and sweden democrats"
        # Problem 2: The party names themselves are biased towards certain topics/targets and lead to faulty merges.
        # Example: Variations of words such as "Sweden" as the target/topic will be biased towards getting merged with
        # "Sweden Democrats".
        # Solution: Encode party names with sentences that are HIGHLY unlikely to be close to anything in the embedding
        # space, thus enforcing a strong bias in the cosine similarity computation towards the party if encountered.

        party_names = {}
        party_names["m"] = "parrot computer is swimming as screen time"
        party_names["moderaterna"] = "parrot computer is swimming as screen time"
        party_names["moderates"] = "parrot computer is swimming as screen time"
        party_names["the moderates"] = "parrot computer is swimming as screen time"
        party_names["moderate party"] = "parrot computer is swimming as screen time"
        party_names["the moderate party"] = "parrot computer is swimming as screen time"
        party_names["the moderaterna party"] = "parrot computer is swimming as screen time"

        party_names["sd"] = "keyboard can hire the yellow elephant in cosmos"
        party_names["sverigedemokraterna"] = "keyboard can hire the yellow elephant in cosmos"
        party_names["sweden democrats"] = "keyboard can hire the yellow elephant in cosmos"
        party_names["the sweden democrats"] = "keyboard can hire the yellow elephant in cosmos"
        party_names["the swedish democrats"] = "keyboard can hire the yellow elephant in cosmos"
        party_names["swedish democrats"] = "keyboard can hire the yellow elephant in cosmos"
        party_names["@jimmieakesson"] = "keyboard can hire the yellow elephant in cosmos"

        party_names["l"] = "red weather jokes with music and the mathematician"
        party_names["liberalerna"] = "red weather jokes with music and the mathematician"
        party_names["liberals"] = "red weather jokes with music and the mathematician"
        party_names["the liberals"] = "red weather jokes with music and the mathematician"
        party_names["the liberal party"] = "red weather jokes with music and the mathematician"
        party_names["liberal people's party"] = "red weather jokes with music and the mathematician"
        party_names["@johanpehrson"] = "red weather jokes with music and the mathematician"

        party_names["mp"] = "ice piano flies with pencil as direction"
        party_names["miljöpartiet"] = "ice piano flies with pencil as direction"
        party_names["de gröna"] = "ice piano flies with pencil as direction"
        party_names["green party"] = "ice piano flies with pencil as direction"
        party_names["the green party"] = "ice piano flies with pencil as direction"
        party_names["miljopartiet"] = "ice piano flies with pencil as direction"
        party_names["@bolund"] = "ice piano flies with pencil as direction"
        party_names["@martastenevi"] = "ice piano flies with pencil as direction"

        party_names["s"] = "lamp of fire walks bird gladly tomorrow"
        party_names["socialdemokraterna"] = "lamp of fire walks bird gladly tomorrow"
        party_names["social democratic party"] = "lamp of fire walks bird gladly tomorrow"
        party_names["the social democratic party"] = "lamp of fire walks bird gladly tomorrow"
        party_names["social democrats"] = "lamp of fire walks bird gladly tomorrow"
        party_names["the social democrats"] = "lamp of fire walks bird gladly tomorrow"
        party_names["sosse"] = "lamp of fire walks bird gladly tomorrow"
        party_names["sossen"] = "lamp of fire walks bird gladly tomorrow"
        party_names["sossar"] = "lamp of fire walks bird gladly tomorrow"
        party_names["sossarna"] = "lamp of fire walks bird gladly tomorrow"
        party_names["sossarnas"] = "lamp of fire walks bird gladly tomorrow"
        party_names["swedish social democrats"] = "lamp of fire walks bird gladly tomorrow"
        party_names["@swedishpm"] = "lamp of fire walks bird gladly tomorrow"

        party_names["v"] = "rooftop cats play physics with cardboard fire"
        party_names["vänsterpartiet"] = "rooftop cats play physics with cardboard fire"
        party_names["left party"] = "rooftop cats play physics with cardboard fire"
        party_names["the left party"] = "rooftop cats play physics with cardboard fire"
        party_names["@dadgostarnooshi"] = "rooftop cats play physics with cardboard fire"

        party_names["c"] = "differential donuts program sunny waters"
        party_names["centerpartiet"] = "differential donuts program sunny waters"
        party_names["center party"] = "differential donuts program sunny waters"
        party_names["centre party"] = "differential donuts program sunny waters"
        party_names["the center party"] = "differential donuts program sunny waters"
        party_names["@annieloof"] = "differential donuts program sunny waters"

        party_names["kd"] = "cauchy-riemann met sunglasses after rolling yellow"
        party_names["kristdemokraterna"] = "cauchy-riemann met sunglasses after rolling yellow"
        party_names["christian democrats"] = "cauchy-riemann met sunglasses after rolling yellow"
        party_names["the christian democrats"] = "cauchy-riemann met sunglasses after rolling yellow"
        party_names["@buschebba"] = "cauchy-riemann met sunglasses after rolling yellow"

        for i, topic in enumerate(old_topic_list):
            topic = topic.lower()
            topic = topic.replace("  ", " ")  # Collapse double spaces
            topic = topic.strip()
            if topic in party_names:
                old_topic_list[i] = party_names.get(topic)

        return old_topic_list

    def reset_party_names(self, old_topic_list):
        """
        Decodes the encoded party names back to their abbreviations.
        :param old_topic_list: list of topics
        :return: list of decoded topics
        """
        party_names = {}
        party_names["m"] = "parrot computer is swimming as screen time"
        party_names["sd"] = "keyboard can hire the yellow elephant in cosmos"
        party_names["l"] = "red weather jokes with music and the mathematician"
        party_names["mp"] = "ice piano flies with pencil as direction"
        party_names["s"] = "lamp of fire walks bird gladly tomorrow"
        party_names["v"] = "rooftop cats play physics with cardboard fire"
        party_names["c"] = "differential donuts program sunny waters"
        party_names["kd"] = "cauchy-riemann met sunglasses after rolling yellow"
        inverted_dict = {}
        # Invert the dictionary so that each encoding sentence maps back to its party abbreviation
        for k, v in party_names.items():
            if v not in inverted_dict:
                inverted_dict[v] = k
        # Update values in old_topic_list
        for i, topic in enumerate(old_topic_list):
            if topic in inverted_dict.keys():
                old_topic_list[i] = inverted_dict.get(topic)

        return old_topic_list

    def merge_classifications(self, old_list, classification_type):
        """
        Merges topics/targets from GPT-3 according to a list of predefined topics/targets.
        :param old_list: list of the topics/targets to be merged
        :param classification_type: type of classification: topic or target
        :return: list of new topics/targets
        """
        # Get the tuple of lists containing all synonyms and general topics/targets
        tup_list = self.mat_to_list(self.file_to_mat(classification_type))
        # Save list of synonyms
        synonym_list = tup_list[0]
        # Save list of mappings between synonym and general topic/target
        synonym_mappings = tup_list[1]
        # Load embedding model names
        model_list = ['sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2', 'all-MiniLM-L6-v2']
        result_dict = {}
        # Encode party names
        old_list = self.clean_party_names(old_list)
        for model_name in model_list:
            model = SentenceTransformer(model_name)
            # Encode the topics/targets with the sentence transformer model
            old_list_embeddings = model.encode(old_list, batch_size=64, show_progress_bar=True,
                                               convert_to_tensor=True)
            # Encode the synonyms with the sentence transformer model
            synonym_list_embeddings = model.encode(synonym_list, batch_size=64, show_progress_bar=True,
                                                   convert_to_tensor=True)
            for i, embedded_classification in enumerate(old_list_embeddings):
                result_list = []
                for embedded_synonyms in synonym_list_embeddings:
                    # Compute the cosine similarity between every classification and synonym
                    result = 1 - spatial.distance.cosine(embedded_classification, embedded_synonyms)
                    result_list.append(result)
                max_value = max(result_list)
                max_index = result_list.index(max_value)
                old_classification = old_list[i]
                # Extract the general topic/target
                new_classification = synonym_mappings[max_index]
                # Save the topic/target that yielded the highest cosine similarity value
                if old_classification not in result_dict:
                    result_dict[old_classification] = [(new_classification, max_value, synonym_list[max_index])]
                # When we have found the best topics/targets after using the first transformer model
                else:
                    # Append the results from the next model
                    result_dict[old_classification].append((new_classification, max_value, synonym_list[max_index]))

        new_dict = {}
        # Time to replace the old values with the new ones
        for old_values in result_dict:
            tup_list = result_dict[old_values]
            max_tup = max(tup_list, key=lambda item: item[1])
            if classification_type == "topic":
                limit = 0.4
            else:
                limit = 0.75
            # Discard the classification if the old topic/target is not similar to anything in our synonym lists
            if max_tup[1] < limit:
                max_tup = ("ERROR_9000", "{:.2f}".format(round(max_tup[1], 2)), "none")
            else:
                max_tup = (max_tup[0], "{:.2f}".format(round(max_tup[1], 2)), max_tup[2])
            new_classification = max_tup
            if old_values not in new_dict:
                new_dict[old_values] = new_classification
        new_list = []
        for old_value in old_list:
            new_list.append(new_dict[old_value])
        return new_list

    def merge_all(self):
        """
        Merges main+sub topics and targets, and updates the dataframe.
        :return: None
        """
        df_topics = self.df.copy()

        sub_topics = df_topics['sub_topic']
        sub_topics = sub_topics.tolist()
        sub_topics = self.cleanup_list(sub_topics)

        main_topics = df_topics['main_topic']
        main_topics = main_topics.tolist()
        main_topics = self.cleanup_list(main_topics)

        merged_topic_list = self.merge_lists(main_topics, sub_topics)

        targets = df_topics['target']
        targets = targets.tolist()
        targets = self.cleanup_list(targets)

        merged_topics = self.merge_classifications(merged_topic_list, "topic")
        merged_targets = self.merge_classifications(targets, "target")

        print("The following merges were made: ")
        for i, top in enumerate(merged_topic_list):
            print("TOPICS: ", top, " -> ", merged_topics[i])

        t_list = []
        for i in range(len(merged_topics)):
            t_list.append(tuple(merged_topics[i]) + tuple(merged_targets[i]))
        merged_tuples = t_list
        df_topics['merged_tuple'] = merged_tuples

        df = self.split_merged_tuple_into_columns(df_topics)
        print("Merging finished...")
        self.df = df

    def split_merged_tuple_into_columns(self, df):
        """
        Splits the merged tuple (merged topic, merged target) into columns.
        :param df: dataframe containing a 'merged_tuple' column
        :return: the updated dataframe
        """
        df_topic = df.copy()
        df_topic_split = pd.DataFrame(df_topic['merged_tuple'].tolist(),
                                      columns=['merged_topic', 'cos_sim_topic', 'synonym_topic',
                                               'merged_target', 'cos_sim_target', 'synonym_target'])
        self.df['merged_tuple'] = df_topic['merged_tuple'].tolist()
        # Manually add columns to self.df
        self.df['merged_topic'] = df_topic_split['merged_topic'].tolist()
        self.df['cos_sim_topic'] = df_topic_split['cos_sim_topic'].tolist()
        self.df['synonym_topic'] = self.reset_party_names(df_topic_split['synonym_topic'].tolist())
        self.df['merged_target'] = df_topic_split['merged_target'].tolist()
        self.df['cos_sim_target'] = df_topic_split['cos_sim_target'].tolist()
        self.df['synonym_target'] = self.reset_party_names(df_topic_split['synonym_target'].tolist())

        return self.df

    def run_main_pipeline(self, filename="{}/data/twitterdata.csv".format(ROOT_PATH)):
        """
        Classifies the topics/sentiments of a user's tweets.
        We presume that all tweets inside the twitterdata.csv file are already classified.
        :param filename: filename of the csv file
        :return: None
        """
        # Check if the file exists; if not, classify everything and create it
        if os.path.exists(filename):
            # Fetch tweets from the csv file
            already_classified_df = pd.read_csv(filename, on_bad_lines='skip')
            print("Already classified tweets: {}".format(already_classified_df.shape[0]))
            # Create a temporary df holding the already classified rows that also appear in self.df
            temp_df = already_classified_df[already_classified_df['id'].isin(self.df['id'])]
            # Remove rows from self.df that are already in already_classified_df
            self.df = self.df[~self.df['id'].isin(already_classified_df['id'])]
            # Only classify non-empty rows
            if self.df.shape[0] > 0:
                time.sleep(10)
                print("Classifying topic, subtopic, sentiment and target of {} tweets...".format(self.df.shape[0]))
                self.df = self.classify_all_list()
                self.df = self.df.replace({'': 'none'}, regex=True)
                self.df = self.df.replace({' ': 'none'}, regex=True)
                print("Merging topics...")
                self.merge_all()
                print("Writing to csv...")
                self.df_to_csv(filename)
                # Concatenate temp_df and self.df
                self.df = pd.concat([temp_df, self.df], ignore_index=True)
                print("Appended {}.".format(filename))
                return None
            else:
                self.df = pd.concat([temp_df, self.df], ignore_index=True)
                print("No new tweets to classify.")
                return None
        else:
            print("No csv file found. Continuing without removing already classified tweets.")
            print("Classifying topic, subtopic, sentiment and target of {} tweets...".format(self.df.shape[0]))
            self.df = self.classify_all_list()
            self.df = self.df.replace({'': 'none'}, regex=True)
            self.df = self.df.replace({' ': 'none'}, regex=True)
            print("Merging topics...")
            self.merge_all()
            print("Writing to csv file...")
            self.df_to_csv(filename)
            print("Created {}.".format(filename))
            return None


if __name__ == "__main__":
    # $6.39 @ 3431 tweets
    # $18.00 @ 4608 tweets
    # $11.61 to classify 1177 tweets ~ $0.01 / tweet

    # This code snippet allows for scraping and classifying by simply specifying a start and end date.
    USER_LIST = ['jimmieakesson', 'BuschEbba', 'annieloof', 'JohanPehrson', 'bolund', 'martastenevi', 'SwedishPM',
                 'dadgostarnooshi']
    start_date = date(2022, 8, 4)
    end_date = date(2022, 8, 4)
    delta = timedelta(days=1)
    while start_date <= end_date:
        from_date = start_date.strftime("%Y-%m-%d")
        start_date += delta
        to_date = start_date.strftime("%Y-%m-%d")
        print("curr_date: ", from_date)
        tc = TextClassifier(from_date=from_date, to_date=to_date, user_list=USER_LIST, num_tweets=6000)
        tc.run_main_pipeline()
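
For context on the merging step in this file: merge_classifications() matches each raw GPT-3 topic/target against a predefined synonym list by cosine similarity of sentence embeddings. The snippet below is a minimal standalone sketch of that matching step, not part of the commit; the synonym list and the example prediction are made-up values, and it assumes the same sentence-transformers and scipy packages imported above are installed.

# Minimal sketch of the cosine-similarity matching used by merge_classifications().
# The synonym list and prediction below are illustrative stand-ins, not data from the repo.
from scipy import spatial
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('all-MiniLM-L6-v2')  # one of the two models listed in model_list above

synonyms = ["immigration", "healthcare", "crime"]   # stand-in for a *_synonyms.txt file
prediction = "border policy"                        # a raw topic as GPT-3 might return it

prediction_embedding = model.encode(prediction)
synonym_embeddings = model.encode(synonyms)

# Cosine similarity = 1 - cosine distance, exactly as in merge_classifications()
scores = [1 - spatial.distance.cosine(prediction_embedding, e) for e in synonym_embeddings]
best_match = synonyms[scores.index(max(scores))]
print(best_match, max(scores))  # "immigration" is expected to score highest

In the full pipeline, a best score below the method's limit (0.4 for topics, 0.75 for targets) is mapped to ERROR_9000 instead of the matched synonym's general topic/target.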