Demea9000 committed on
Commit
456b287
1 Parent(s): 12597ef

added remove duplicates method to csv data

Browse files
data/twitterdata.csv CHANGED
@@ -17,5 +17,3 @@ id,tweet,date,user_id,username,urls,nlikes,nreplies,nretweets
17
  1194553636946350080,Morgan Johansson måste avgå. #pldebatt #svpol https://t.co/QsVAhqvaou,2019-11-13 10:52:35,95972673,jimmieakesson,[],1713,47,203
18
  1194528503284346881,Idag begär jag att riksdagen avsätter justitieminister Morgan Johansson. #svpol https://t.co/tL703x5eYQ,2019-11-13 09:12:43,95972673,jimmieakesson,[],1844,80,140
19
  1194495733858222080,Åtta år senare och Morgan kämpar vidare... 🦸‍♂️ https://t.co/iCWrEwhgHP,2019-11-13 07:02:30,95972673,jimmieakesson,[],1769,87,271
20
- 1193987880609341440,Stefan Löfven kommenterar oppositionens misstroendeförklaring mot justitieministern. Kan vara något av det mest arroganta jag någonsin läst. #avgå https://t.co/MZHSF1eyMm,2019-11-11 21:24:28,95972673,jimmieakesson,[],1946,160,288
21
- 1193852463390035968,"Det är uppenbart att justitieministern, som i det här fallet är ytterst ansvarig för att klara den här situationen, inte förstår allvaret och inte klarar av att hantera det. https://t.co/YoSQnJiluZ https://t.co/eBkBy1SQdU",2019-11-11 12:26:22,95972673,jimmieakesson,['https://sverigesradio.se/sida/artikel.aspx?programid=83&artikel=7341552'],1657,62,178
 
17
  1194553636946350080,Morgan Johansson måste avgå. #pldebatt #svpol https://t.co/QsVAhqvaou,2019-11-13 10:52:35,95972673,jimmieakesson,[],1713,47,203
18
  1194528503284346881,Idag begär jag att riksdagen avsätter justitieminister Morgan Johansson. #svpol https://t.co/tL703x5eYQ,2019-11-13 09:12:43,95972673,jimmieakesson,[],1844,80,140
19
  1194495733858222080,Åtta år senare och Morgan kämpar vidare... 🦸‍♂️ https://t.co/iCWrEwhgHP,2019-11-13 07:02:30,95972673,jimmieakesson,[],1769,87,271
 
 
textclassifier/TextClassifier.py CHANGED
@@ -45,6 +45,13 @@ class TextClassifier:
45
  # self.api_key = '<REDACTED>'  # SECURITY: an API key was committed on this line; revoke it and never keep keys in source
46
  openai.api_key = OPENAI_AUTHTOKEN
47
 
 
 
 
 
 
 
 
48
  @staticmethod
49
  def cleanup_sentiment_results(classification_unclean):
50
  """
@@ -219,7 +226,7 @@ class TextClassifier:
219
 
220
  def df_to_csv(self, filename="{}/data/twitterdata.csv".format(ROOT_PATH)):
221
  """
222
- Writes pandas df to csv file. If it already exists, it appends.
223
  :param filename:
224
  :return:
225
  """
@@ -228,18 +235,32 @@ class TextClassifier:
228
  else:
229
  self.df.to_csv(filename, mode='a', header=False, index=False)
230
 
231
def return_row_if_ID_exists(self, id: str, filename="{}/data/twitterdata.csv".format(ROOT_PATH)):
    """
    Checks if an ID is already in the csv file and, if it is, returns that row.
    :param id: tweet id to look for; compared as text against the first column of each row
    :param filename: filename of csv file to search
    :return: the first matching row as a list of strings, or None when no row matches
    """
    wanted = str(id)
    # newline='' lets the csv module handle line breaks inside quoted fields itself;
    # utf-8 matches the default encoding pandas uses when it writes csv files.
    with open(filename, 'r', newline='', encoding='utf-8') as csvfile:
        for row in csv.reader(csvfile):
            # csv.reader yields an empty list for a blank line; skip it instead of indexing into it.
            if row and row[0] == wanted:
                return row
    return None
243
 
244
  def __repr__(self):
245
  """
@@ -248,7 +269,9 @@ class TextClassifier:
248
  """
249
  return "Classifier for user: " + self.user_name + " with model: " + self.model_name + "."
250
 
251
- # if __name__ == "__main__":
 
 
252
  # import pandas as pd
253
  # from datetime import datetime
254
  # import os
 
45
  # self.api_key = '<REDACTED>'  # SECURITY: an API key was committed on this line; revoke it and never keep keys in source
46
  openai.api_key = OPENAI_AUTHTOKEN
47
 
48
def classify_topic_and_sentiment(self):
    """
    Runs both classification passes: the topic pass first, then the sentiment pass.
    :return: None
    """
    self.classify_topic_of_tweets()
    self.classify_sentiment_of_tweets()
    # NOTE(review): a "save the dataframe to a csv file" step was announced here,
    # but this method does not write anything itself.
53
+
54
+
55
  @staticmethod
56
  def cleanup_sentiment_results(classification_unclean):
57
  """
 
226
 
227
  def df_to_csv(self, filename="{}/data/twitterdata.csv".format(ROOT_PATH)):
228
  """
229
+ Writes pandas df to csv file. If it already exists, it appends. If not, it creates. It also removes duplicates.
230
  :param filename:
231
  :return:
232
  """
 
235
  else:
236
  self.df.to_csv(filename, mode='a', header=False, index=False)
237
 
238
+ self.remove_duplicates_from_csv(filename)
239
+
240
@staticmethod
def remove_duplicates_from_csv(filename="{}/data/twitterdata.csv".format(ROOT_PATH)):
    """
    Removes duplicate lines from a csv file, keeping the first occurrence of each line.

    The previous implementation compared every line against the slice after the line's
    *first* occurrence, so every copy of a duplicated line was discarded, not just the
    extra ones. Here exactly one copy survives and the original line order is kept.

    NOTE: lines are compared as raw text, one physical line at a time. A quoted field
    that contains a line break spans several physical lines and is compared piecewise.

    :param filename: filename of csv file
    :return: None
    """
    # newline='' keeps the file's own line endings untouched on the way in and out;
    # utf-8 matches the default encoding pandas uses when it writes csv files.
    with open(filename, 'r', encoding='utf-8', newline='') as f:
        lines = f.readlines()

    # dict keys are unique and keep insertion order, so this drops later repeats only.
    unique_lines = list(dict.fromkeys(lines))

    with open(filename, 'w', encoding='utf-8', newline='') as f:
        f.writelines(unique_lines)
253
+
254
def remove_already_classified_tweets(self, filename="{}/data/twitterdata.csv".format(ROOT_PATH)):
    """
    Drops the tweets that already have a sentiment from self.df and hands the rest to df_to_csv.
    :param filename: filename of csv file, passed on to df_to_csv
    :return: None
    """
    # Rows whose 'sentiment' is null are the ones kept.
    unclassified_mask = self.df['sentiment'].isnull()
    self.df = self.df[unclassified_mask]
    self.df_to_csv(filename)
 
 
264
 
265
  def __repr__(self):
266
  """
 
269
  """
270
  return "Classifier for user: " + self.user_name + " with model: " + self.model_name + "."
271
 
272
+ if __name__ == "__main__":
273
+ tc = TextClassifier(from_date="2022-01-01", to_date="2022-05-31", user_name='jimmieakesson', num_tweets=20)
274
+ tc.remove_duplicates_from_csv()
275
  # import pandas as pd
276
  # from datetime import datetime
277
  # import os