Spaces:
Runtime error
Runtime error
Bugfixing
Browse files- .idea/csv-plugin.xml +16 -0
- .idea/misc.xml +1 -1
- .idea/politweet.iml +1 -0
- textclassifier/TextClassifier.py +38 -18
.idea/csv-plugin.xml
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<?xml version="1.0" encoding="UTF-8"?>
|
2 |
+
<project version="4">
|
3 |
+
<component name="CsvFileAttributes">
|
4 |
+
<option name="attributeMap">
|
5 |
+
<map>
|
6 |
+
<entry key="/data/twitterdata.csv">
|
7 |
+
<value>
|
8 |
+
<Attribute>
|
9 |
+
<option name="separator" value="," />
|
10 |
+
</Attribute>
|
11 |
+
</value>
|
12 |
+
</entry>
|
13 |
+
</map>
|
14 |
+
</option>
|
15 |
+
</component>
|
16 |
+
</project>
|
.idea/misc.xml
CHANGED
@@ -1,4 +1,4 @@
|
|
1 |
<?xml version="1.0" encoding="UTF-8"?>
|
2 |
<project version="4">
|
3 |
-
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.
|
4 |
</project>
|
|
|
1 |
<?xml version="1.0" encoding="UTF-8"?>
|
2 |
<project version="4">
|
3 |
+
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10 (politweet)" project-jdk-type="Python SDK" />
|
4 |
</project>
|
.idea/politweet.iml
CHANGED
@@ -3,6 +3,7 @@
|
|
3 |
<component name="NewModuleRootManager">
|
4 |
<content url="file://$MODULE_DIR$">
|
5 |
<excludeFolder url="file://$MODULE_DIR$/politweet-environment" />
|
|
|
6 |
</content>
|
7 |
<orderEntry type="inheritedJdk" />
|
8 |
<orderEntry type="sourceFolder" forTests="false" />
|
|
|
3 |
<component name="NewModuleRootManager">
|
4 |
<content url="file://$MODULE_DIR$">
|
5 |
<excludeFolder url="file://$MODULE_DIR$/politweet-environment" />
|
6 |
+
<excludeFolder url="file://$MODULE_DIR$/env" />
|
7 |
</content>
|
8 |
<orderEntry type="inheritedJdk" />
|
9 |
<orderEntry type="sourceFolder" forTests="false" />
|
textclassifier/TextClassifier.py
CHANGED
@@ -5,6 +5,7 @@ from twitterscraper import TwitterScraper
|
|
5 |
from datetime import date
|
6 |
import os
|
7 |
from dotenv import find_dotenv, load_dotenv
|
|
|
8 |
|
9 |
# Set one directory up into ROOT_PATH
|
10 |
ROOT_PATH = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
@@ -51,7 +52,6 @@ class TextClassifier:
|
|
51 |
|
52 |
# save the dataframe to a csv file
|
53 |
|
54 |
-
|
55 |
@staticmethod
|
56 |
def cleanup_sentiment_results(classification_unclean):
|
57 |
"""
|
@@ -219,7 +219,7 @@ class TextClassifier:
|
|
219 |
return df_topic
|
220 |
|
221 |
@staticmethod
|
222 |
-
def cleanup_topic_results(
|
223 |
new_item = text.replace("\n", " ")
|
224 |
new_item = new_item.replace(" ", " ")
|
225 |
return new_item
|
@@ -262,6 +262,35 @@ class TextClassifier:
|
|
262 |
self.df = df
|
263 |
self.df_to_csv(filename)
|
264 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
265 |
def __repr__(self):
|
266 |
"""
|
267 |
Gives a string that describes which user is classified
|
@@ -269,20 +298,11 @@ class TextClassifier:
|
|
269 |
"""
|
270 |
return "Classifier for user: " + self.user_name + " with model: " + self.model_name + "."
|
271 |
|
|
|
|
|
272 |
if __name__ == "__main__":
|
273 |
-
tc = TextClassifier(from_date="
|
274 |
-
tc.
|
275 |
-
|
276 |
-
|
277 |
-
|
278 |
-
# # show all columns
|
279 |
-
# pd.set_option('display.max_columns', None)
|
280 |
-
#
|
281 |
-
# tc = TextClassifier(from_date="2019-01-01", to_date="2019-05-31", user_name='jimmieakesson', num_tweets=20)
|
282 |
-
# tc.classify_sentiment_of_tweets()
|
283 |
-
# # df = tc.analyze_sentiment_of_tweets()
|
284 |
-
# # print(df)
|
285 |
-
# df = tc.classify_topics_of_tweets()
|
286 |
-
# print(df)
|
287 |
-
# # save to csv in a folder under politweet with timestamp in name
|
288 |
-
# df.to_csv(f"{datetime.now().strftime('%Y-%m-%d %H-%M-%S')}_tweets.csv")
|
|
|
5 |
from datetime import date
|
6 |
import os
|
7 |
from dotenv import find_dotenv, load_dotenv
|
8 |
+
import pandas as pd
|
9 |
|
10 |
# Set one directory up into ROOT_PATH
|
11 |
ROOT_PATH = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
|
|
52 |
|
53 |
# save the dataframe to a csv file
|
54 |
|
|
|
55 |
@staticmethod
|
56 |
def cleanup_sentiment_results(classification_unclean):
|
57 |
"""
|
|
|
219 |
return df_topic
|
220 |
|
221 |
@staticmethod
|
222 |
+
def cleanup_topic_results(text):
    """
    Flattens a piece of text onto a single line.

    Newlines are replaced by spaces, and any run of consecutive spaces that
    results (or that was already present) is collapsed into one space.
    :param text: string to clean up
    :return: the cleaned string
    """
    new_item = text.replace("\n", " ")
    # A single replace("  ", " ") pass halves a run of spaces but does not
    # fully collapse it, so repeat until no double space is left.
    while "  " in new_item:
        new_item = new_item.replace("  ", " ")
    return new_item
|
|
|
262 |
self.df = df
|
263 |
self.df_to_csv(filename)
|
264 |
|
265 |
+
def get_tweet_by_id(self, id, filename="{}/data/twitterdata.csv".format(ROOT_PATH)):
    """
    Returns tweet by id.

    Reads the csv file at ``filename`` and selects the rows whose 'id'
    column equals ``id``.
    :param id: id of tweet
    :param filename: path of the csv file to look the tweet up in
    :return: Pandas dataframe holding the matching row(s); empty if the id
        is not present in the file
    """
    stored_df = pd.read_csv(filename)
    # NOTE(review): assumes `id` has the same type pandas infers for the
    # 'id' column (typically int64) - confirm against the caller.
    return stored_df[stored_df['id'] == id]
|
271 |
+
|
272 |
+
def run_main_pipeline(self, filename="{}/data/twitterdata.csv".format(ROOT_PATH)):
    """
    Classifies the topics/sentiments of a user's tweets.

    We presume that all tweets inside the csv file at ``filename`` are
    already classified. Tweets in self.df whose id is found in that file are
    therefore not classified again; their stored rows are taken from the
    file and added back to the result after the remaining tweets have been
    classified.
    :param filename: path of the csv file holding the already classified tweets
    :return: Pandas dataframe
    """
    already_classified_df = pd.read_csv(filename)

    # Stored rows for the tweets that have been classified before. They are
    # selected from the frame that was just read, so the file is parsed once
    # and the `filename` argument is respected.
    cached_df = already_classified_df[already_classified_df['id'].isin(self.df['id'])]

    # Keep only the tweets that still need to be classified.
    self.df = self.df[~self.df['id'].isin(already_classified_df['id'])]

    self.df = self.classify_topics_of_tweets()
    self.df = self.classify_sentiment_of_tweets()
    # self.df = self.analyze_sentiment_of_tweets()
    # NOTE(review): the cached rows are not part of self.df at this point;
    # confirm that df_to_csv adds to the file rather than overwriting it.
    self.df_to_csv(filename)

    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    self.df = pd.concat([self.df, cached_df])
    return self.df
|
293 |
+
|
294 |
def __repr__(self):
|
295 |
"""
|
296 |
Gives a string that describes which user is classified
|
|
|
298 |
"""
|
299 |
return "Classifier for user: " + self.user_name + " with model: " + self.model_name + "."
|
300 |
|
301 |
+
|
302 |
+
|
303 |
if __name__ == "__main__":
    # Script entry point: build a classifier for one user and date range,
    # then call its test() method.
    # NOTE(review): test() is not defined in the visible part of the file -
    # confirm it exists on TextClassifier.
    tc = TextClassifier(
        user_name='jimmieakesson',
        from_date="2020-01-01",
        to_date="2020-12-31",
        num_tweets=20,
    )
    tc.test()
|
306 |
+
|
307 |
+
|
308 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|