Demea9000 committed on
Commit
1903058
1 Parent(s): fe688af

ändringar i run_main_pipeline

Browse files
.idea/misc.xml CHANGED
@@ -1,4 +1,4 @@
1
  <?xml version="1.0" encoding="UTF-8"?>
2
  <project version="4">
3
- <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10 (politweet)" project-jdk-type="Python SDK" />
4
  </project>
 
1
  <?xml version="1.0" encoding="UTF-8"?>
2
  <project version="4">
3
+ <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.9 (politweet)" project-jdk-type="Python SDK" />
4
  </project>
.idea/politweet.iml CHANGED
@@ -5,7 +5,7 @@
5
  <excludeFolder url="file://$MODULE_DIR$/politweet-environment" />
6
  <excludeFolder url="file://$MODULE_DIR$/env" />
7
  </content>
8
- <orderEntry type="inheritedJdk" />
9
  <orderEntry type="sourceFolder" forTests="false" />
10
  </component>
11
  <component name="PyNamespacePackagesService">
 
5
  <excludeFolder url="file://$MODULE_DIR$/politweet-environment" />
6
  <excludeFolder url="file://$MODULE_DIR$/env" />
7
  </content>
8
+ <orderEntry type="jdk" jdkName="Python 3.9 (politweet)" jdkType="Python SDK" />
9
  <orderEntry type="sourceFolder" forTests="false" />
10
  </component>
11
  <component name="PyNamespacePackagesService">
textclassifier/TextClassifier.py CHANGED
@@ -275,21 +275,22 @@ class TextClassifier:
275
  #We presume that all tweets inside the twitterdata.csv file are already classified.
276
  :return: Pandas dataframe
277
  """
278
- temp_df = pd.DataFrame(
279
- columns=['id', 'tweet', 'date', 'user_id', 'username', 'urls', 'nlikes', 'nreplies', 'nretweets'])
280
- already_classified_df = pd.read_csv(filename)
281
-
282
- for index, row in self.df.iterrows():
283
- if row['id'] in already_classified_df['id'].values:
284
- temp_df = temp_df.append(self.get_tweet_by_id(row['id']))
285
- self.df = self.df.drop(index)
 
 
286
 
287
  self.df = self.classify_topics_of_tweets()
288
  self.df = self.classify_sentiment_of_tweets()
289
  # self.df = self.analyze_sentiment_of_tweets()
 
290
  self.df_to_csv(filename)
291
- self.df = self.df.append(temp_df)
292
- return self.df
293
 
294
  def __repr__(self):
295
  """
@@ -299,10 +300,6 @@ class TextClassifier:
299
  return "Classifier for user: " + self.user_name + " with model: " + self.model_name + "."
300
 
301
 
302
-
303
  if __name__ == "__main__":
304
- tc = TextClassifier(from_date="2020-01-01", to_date="2020-12-31", user_name='jimmieakesson', num_tweets=20)
305
- tc.test()
306
-
307
-
308
-
 
275
  #We presume that all tweets inside the twitterdata.csv file are already classified.
276
  :return: Pandas dataframe
277
  """
278
+ # temp_df = pd.DataFrame(
279
+ # columns=['id', 'tweet', 'date', 'user_id', 'username', 'urls', 'nlikes', 'nreplies', 'nretweets'])
280
+ if os.path.exists(filename):
281
+ already_classified_df = pd.read_csv(filename)
282
+ # Remove all entries in self.df that are already in already_classified_df
283
+ self.df = self.df.merge(already_classified_df, how='left', on='id')
284
+ # Create a new dataframe where entries in already_classified_df but not in self.df
285
+ temp_df = already_classified_df.merge(self.df, how='left', on='id')
286
+ else:
287
+ print("No csv file found. Continuing without removing already classified tweets.")
288
 
289
  self.df = self.classify_topics_of_tweets()
290
  self.df = self.classify_sentiment_of_tweets()
291
  # self.df = self.analyze_sentiment_of_tweets()
292
+
293
  self.df_to_csv(filename)
 
 
294
 
295
  def __repr__(self):
296
  """
 
300
  return "Classifier for user: " + self.user_name + " with model: " + self.model_name + "."
301
 
302
 
 
303
  if __name__ == "__main__":
304
+ tc = TextClassifier(from_date="2019-12-01", to_date="2020-12-31", user_name='jimmieakesson', num_tweets=100)
305
+ tc.run_main_pipeline()