Spaces:
Runtime error
Runtime error
added remove duplicates method to csv data
Browse files- data/twitterdata.csv +0 -2
- textclassifier/TextClassifier.py +35 -12
data/twitterdata.csv
CHANGED
@@ -17,5 +17,3 @@ id,tweet,date,user_id,username,urls,nlikes,nreplies,nretweets
|
|
17 |
1194553636946350080,Morgan Johansson måste avgå. #pldebatt #svpol https://t.co/QsVAhqvaou,2019-11-13 10:52:35,95972673,jimmieakesson,[],1713,47,203
|
18 |
1194528503284346881,Idag begär jag att riksdagen avsätter justitieminister Morgan Johansson. #svpol https://t.co/tL703x5eYQ,2019-11-13 09:12:43,95972673,jimmieakesson,[],1844,80,140
|
19 |
1194495733858222080,Åtta år senare och Morgan kämpar vidare... 🦸♂️ https://t.co/iCWrEwhgHP,2019-11-13 07:02:30,95972673,jimmieakesson,[],1769,87,271
|
20 |
-
1193987880609341440,Stefan Löfven kommenterar oppositionens misstroendeförklaring mot justitieministern. Kan vara något av det mest arroganta jag någonsin läst. #avgå https://t.co/MZHSF1eyMm,2019-11-11 21:24:28,95972673,jimmieakesson,[],1946,160,288
|
21 |
-
1193852463390035968,"Det är uppenbart att justitieministern, som i det här fallet är ytterst ansvarig för att klara den här situationen, inte förstår allvaret och inte klarar av att hantera det. https://t.co/YoSQnJiluZ https://t.co/eBkBy1SQdU",2019-11-11 12:26:22,95972673,jimmieakesson,['https://sverigesradio.se/sida/artikel.aspx?programid=83&artikel=7341552'],1657,62,178
|
|
|
17 |
1194553636946350080,Morgan Johansson måste avgå. #pldebatt #svpol https://t.co/QsVAhqvaou,2019-11-13 10:52:35,95972673,jimmieakesson,[],1713,47,203
|
18 |
1194528503284346881,Idag begär jag att riksdagen avsätter justitieminister Morgan Johansson. #svpol https://t.co/tL703x5eYQ,2019-11-13 09:12:43,95972673,jimmieakesson,[],1844,80,140
|
19 |
1194495733858222080,Åtta år senare och Morgan kämpar vidare... 🦸♂️ https://t.co/iCWrEwhgHP,2019-11-13 07:02:30,95972673,jimmieakesson,[],1769,87,271
|
|
|
|
textclassifier/TextClassifier.py
CHANGED
@@ -45,6 +45,13 @@ class TextClassifier:
|
|
45 |
# self.api_key = '<redacted>'  # secret removed: API keys must not be kept in source control
|
46 |
openai.api_key = OPENAI_AUTHTOKEN
|
47 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
48 |
@staticmethod
|
49 |
def cleanup_sentiment_results(classification_unclean):
|
50 |
"""
|
@@ -219,7 +226,7 @@ class TextClassifier:
|
|
219 |
|
220 |
def df_to_csv(self, filename="{}/data/twitterdata.csv".format(ROOT_PATH)):
|
221 |
"""
|
222 |
-
Writes pandas df to csv file. If it already exists, it appends.
|
223 |
:param filename:
|
224 |
:return:
|
225 |
"""
|
@@ -228,18 +235,32 @@ class TextClassifier:
|
|
228 |
else:
|
229 |
self.df.to_csv(filename, mode='a', header=False, index=False)
|
230 |
|
231 |
-
|
|
|
|
|
|
|
232 |
"""
|
233 |
-
|
234 |
-
:param
|
235 |
-
:return:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
236 |
"""
|
237 |
-
|
238 |
-
|
239 |
-
|
240 |
-
|
241 |
-
return row
|
242 |
-
return None
|
243 |
|
244 |
def __repr__(self):
|
245 |
"""
|
@@ -248,7 +269,9 @@ class TextClassifier:
|
|
248 |
"""
|
249 |
return "Classifier for user: " + self.user_name + " with model: " + self.model_name + "."
|
250 |
|
251 |
-
|
|
|
|
|
252 |
# import pandas as pd
|
253 |
# from datetime import datetime
|
254 |
# import os
|
|
|
45 |
# self.api_key = '<redacted>'  # secret removed: API keys must not be kept in source control
|
46 |
openai.api_key = OPENAI_AUTHTOKEN
|
47 |
|
48 |
+
def classify_topic_and_sentiment(self):
    """
    Runs both classification passes on this classifier, topic first and
    sentiment second.
    :return: None
    """
    # Topic pass runs before the sentiment pass; keep this order.
    self.classify_topic_of_tweets()
    self.classify_sentiment_of_tweets()
    # TODO: save the dataframe to a csv file (no save is performed here)
|
53 |
+
|
54 |
+
|
55 |
@staticmethod
|
56 |
def cleanup_sentiment_results(classification_unclean):
|
57 |
"""
|
|
|
226 |
|
227 |
def df_to_csv(self, filename="{}/data/twitterdata.csv".format(ROOT_PATH)):
|
228 |
"""
|
229 |
+
Writes pandas df to csv file. If it already exists, it appends. If not, it creates. It also removes duplicates.
|
230 |
:param filename:
|
231 |
:return:
|
232 |
"""
|
|
|
235 |
else:
|
236 |
self.df.to_csv(filename, mode='a', header=False, index=False)
|
237 |
|
238 |
+
self.remove_duplicates_from_csv(filename)
|
239 |
+
|
240 |
+
@staticmethod
def remove_duplicates_from_csv(filename="{}/data/twitterdata.csv".format(ROOT_PATH)):
    """
    Removes duplicate lines from a csv file, in place.

    The first occurrence of every line is kept and later repeats are
    dropped, so the relative order of the remaining lines (and a header on
    the first line) is preserved. Lines are compared without their line
    terminator, so a final line that lacks a trailing newline is still
    recognised as a duplicate of an earlier line.

    NOTE: the comparison is done per physical line, not per csv record; a
    quoted field that spans several lines is treated as several lines.

    :param filename: filename of csv file
    :return: None
    """
    seen = set()
    unique_lines = []
    # utf-8 matches the default encoding pandas uses when writing the file;
    # newline='' keeps the original line terminators untouched.
    with open(filename, 'r', encoding='utf-8', newline='') as f:
        for line in f:
            key = line.rstrip('\r\n')
            if key not in seen:
                seen.add(key)
                unique_lines.append(line)
    with open(filename, 'w', encoding='utf-8', newline='') as f:
        f.writelines(unique_lines)
|
253 |
+
|
254 |
+
def remove_already_classified_tweets(self, filename="{}/data/twitterdata.csv".format(ROOT_PATH)):
    """
    Keeps only the rows of self.df whose 'sentiment' value is null and then
    passes the reduced dataframe on to df_to_csv.
    :param filename: filename of csv file, forwarded to df_to_csv
    :return: None
    """
    # Rows with a null 'sentiment' are the ones that are retained.
    unclassified_mask = self.df['sentiment'].isnull()
    self.df = self.df[unclassified_mask]
    self.df_to_csv(filename)
|
|
|
|
|
264 |
|
265 |
def __repr__(self):
|
266 |
"""
|
|
|
269 |
"""
|
270 |
return "Classifier for user: " + self.user_name + " with model: " + self.model_name + "."
|
271 |
|
272 |
+
if __name__ == "__main__":
    # Script entry point: build a classifier, then de-duplicate the default csv file.
    classifier = TextClassifier(
        from_date="2022-01-01",
        to_date="2022-05-31",
        user_name='jimmieakesson',
        num_tweets=20,
    )
    classifier.remove_duplicates_from_csv()
|
275 |
# import pandas as pd
|
276 |
# from datetime import datetime
|
277 |
# import os
|