Spaces:
Runtime error
Runtime error
Merge branch 'develop' into 36-create-process-to-store-and-retrieve-data-for-textclassifier
Browse files- .idea/politweet.iml +0 -1
- README.md +8 -3
- requirements.txt +1 -0
- textclassifier/TextClassifier.py +41 -10
.idea/politweet.iml
CHANGED
@@ -3,7 +3,6 @@
|
|
3 |
<component name="NewModuleRootManager">
|
4 |
<content url="file://$MODULE_DIR$">
|
5 |
<excludeFolder url="file://$MODULE_DIR$/politweet-environment" />
|
6 |
-
<excludeFolder url="file://$MODULE_DIR$/venv" />
|
7 |
</content>
|
8 |
<orderEntry type="inheritedJdk" />
|
9 |
<orderEntry type="sourceFolder" forTests="false" />
|
|
|
3 |
<component name="NewModuleRootManager">
|
4 |
<content url="file://$MODULE_DIR$">
|
5 |
<excludeFolder url="file://$MODULE_DIR$/politweet-environment" />
|
|
|
6 |
</content>
|
7 |
<orderEntry type="inheritedJdk" />
|
8 |
<orderEntry type="sourceFolder" forTests="false" />
|
README.md
CHANGED
@@ -20,9 +20,14 @@ För att få alla dependencies:
|
|
20 |
|
21 |
1. skapa en virtual environment: https://docs.python.org/3/library/venv.html
|
22 |
2. Aktivera din virtual environment
|
23 |
-
|
24 |
-
$
|
25 |
-
|
|
|
|
|
|
|
|
|
|
|
26 |
|
27 |
|
28 |
|
|
|
20 |
|
21 |
1. skapa en virtual environment: https://docs.python.org/3/library/venv.html
|
22 |
2. Aktivera din virtual environment
|
23 |
+
3. gå till projektets root path och skriv i terminalen:
|
24 |
+
$ pip install -r requirements.txt
|
25 |
+
4. I vissa fall funkar det inte att installera twint för Ubuntu. Efter att ha ställt in allt funkade det efter att ha kört "sudo apt-get install build-essential" i terminalen.
|
26 |
+
5. För att använda openai behövs en auktoriserings-token. Detta skapas genom att skapa en '.env' fil i projektets root path.
|
27 |
+
6. Skriv in följande i den filen:
|
28 |
+
OPENAI_AUTHTOKEN=din open-ai token
|
29 |
+
7. Nu borde TextClassifier kunna använda openai, givet att du har timmar att lägga till din token.
|
30 |
+
|
31 |
|
32 |
|
33 |
|
requirements.txt
CHANGED
@@ -37,6 +37,7 @@ pycparser==2.21
|
|
37 |
pyparsing==3.0.9
|
38 |
PySocks==1.7.1
|
39 |
python-dateutil==2.8.2
|
|
|
40 |
python-socks==2.0.3
|
41 |
pytz==2022.1
|
42 |
regex==2022.6.2
|
|
|
37 |
pyparsing==3.0.9
|
38 |
PySocks==1.7.1
|
39 |
python-dateutil==2.8.2
|
40 |
+
python-dotenv==0.20.0
|
41 |
python-socks==2.0.3
|
42 |
pytz==2022.1
|
43 |
regex==2022.6.2
|
textclassifier/TextClassifier.py
CHANGED
@@ -5,9 +5,17 @@ from twitterscraper import TwitterScraper
|
|
5 |
from datetime import date
|
6 |
import os
|
7 |
|
|
|
8 |
# Set one directory up into ROOT_PATH
|
9 |
ROOT_PATH = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
10 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
11 |
|
12 |
class TextClassifier:
|
13 |
def __init__(self, model_name="text-davinci-002", from_date='2022-01-01', to_date=str(date.today()),
|
@@ -21,6 +29,14 @@ class TextClassifier:
|
|
21 |
:param to_date: string of the format 'YYYY-MM-DD'.
|
22 |
:param num_tweets: integer value of the maximum number of tweets to be scraped.
|
23 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
24 |
|
25 |
self.model_name = model_name
|
26 |
self.from_date = from_date
|
@@ -30,7 +46,7 @@ class TextClassifier:
|
|
30 |
self.ts = TwitterScraper.TwitterScraper(from_date, to_date, num_tweets)
|
31 |
self.df = self.ts.scrape_by_user(user_name)
|
32 |
# self.api_key = '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'
|
33 |
-
openai.api_key =
|
34 |
|
35 |
def scrape_tweets(self):
|
36 |
"""
|
@@ -51,7 +67,6 @@ class TextClassifier:
|
|
51 |
classification_clean = classification_clean.replace(" ", "")
|
52 |
|
53 |
return classification_clean
|
54 |
-
return response.choices[0]['text']
|
55 |
|
56 |
def classify_sentiment(self, text: str):
|
57 |
"""
|
@@ -205,9 +220,7 @@ class TextClassifier:
|
|
205 |
df_topic['topic'] = df_topic['tweet'].apply(self.classify_topic)
|
206 |
return df_topic
|
207 |
|
208 |
-
|
209 |
-
return "TwitterScraper(from_date={}, to_date={}, num_tweets={})".format(self.from_date, self.to_date,
|
210 |
-
self.num_tweets)
|
211 |
|
212 |
@staticmethod
|
213 |
def cleanup_topic_results(prediction_dict, text):
|
@@ -240,9 +253,27 @@ class TextClassifier:
|
|
240 |
return None
|
241 |
|
242 |
|
243 |
-
|
244 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
245 |
|
246 |
-
pd.set_option('display.max_columns', None)
|
247 |
-
TC = TextClassifier(from_date="2019-01-01", to_date="2019-12-31", user_name='jimmieakesson', num_tweets=100)
|
248 |
-
TC.df_to_csv()
|
|
|
5 |
from datetime import date
|
6 |
import os
|
7 |
|
8 |
+
|
9 |
# Set one directory up into ROOT_PATH
|
10 |
ROOT_PATH = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
11 |
|
12 |
+
from dotenv import find_dotenv, load_dotenv
|
13 |
+
|
14 |
+
dotenv_path = find_dotenv()
|
15 |
+
load_dotenv(dotenv_path)
|
16 |
+
OPENAI_AUTHTOKEN = os.environ.get("OPENAI_AUTHTOKEN")
|
17 |
+
|
18 |
+
|
19 |
|
20 |
class TextClassifier:
|
21 |
def __init__(self, model_name="text-davinci-002", from_date='2022-01-01', to_date=str(date.today()),
|
|
|
29 |
:param to_date: string of the format 'YYYY-MM-DD'.
|
30 |
:param num_tweets: integer value of the maximum number of tweets to be scraped.
|
31 |
"""
|
32 |
+
# Make sure to_date is later than from_date
|
33 |
+
assert from_date < to_date, "from_date must be earlier than to_date"
|
34 |
+
# Make sure the dates are in the correct format
|
35 |
+
assert re.match(r'^\d{4}-\d{2}-\d{2}$', from_date) is not None, "from_date must be in the format YYYY-MM-DD"
|
36 |
+
# Make sure user_name is not empty
|
37 |
+
assert user_name is not None, "user_name cannot be empty"
|
38 |
+
# Make sure num_tweets is a positive integer
|
39 |
+
assert num_tweets > 0, "num_tweets must be a positive integer"
|
40 |
|
41 |
self.model_name = model_name
|
42 |
self.from_date = from_date
|
|
|
46 |
self.ts = TwitterScraper.TwitterScraper(from_date, to_date, num_tweets)
|
47 |
self.df = self.ts.scrape_by_user(user_name)
|
48 |
# self.api_key = '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'
|
49 |
+
openai.api_key = OPENAI_AUTHTOKEN
|
50 |
|
51 |
def scrape_tweets(self):
|
52 |
"""
|
|
|
67 |
classification_clean = classification_clean.replace(" ", "")
|
68 |
|
69 |
return classification_clean
|
|
|
70 |
|
71 |
def classify_sentiment(self, text: str):
|
72 |
"""
|
|
|
220 |
df_topic['topic'] = df_topic['tweet'].apply(self.classify_topic)
|
221 |
return df_topic
|
222 |
|
223 |
+
|
|
|
|
|
224 |
|
225 |
@staticmethod
|
226 |
def cleanup_topic_results(prediction_dict, text):
|
|
|
253 |
return None
|
254 |
|
255 |
|
256 |
+
def __repr__(self):
|
257 |
+
"""
|
258 |
+
Gives a string that describes which user is classified
|
259 |
+
:return:
|
260 |
+
"""
|
261 |
+
return "Classifier for user: " + self.user_name + " with model: " + self.model_name + "."
|
262 |
+
|
263 |
+
|
264 |
+
# if __name__ == "__main__":
|
265 |
+
# import pandas as pd
|
266 |
+
# from datetime import datetime
|
267 |
+
# import os
|
268 |
+
# # show all columns
|
269 |
+
# pd.set_option('display.max_columns', None)
|
270 |
+
#
|
271 |
+
# tc = TextClassifier(from_date="2019-01-01", to_date="2019-05-31", user_name='jimmieakesson', num_tweets=20)
|
272 |
+
# tc.classify_sentiment_of_tweets()
|
273 |
+
# # df = tc.analyze_sentiment_of_tweets()
|
274 |
+
# # print(df)
|
275 |
+
# df = tc.classify_topics_of_tweets()
|
276 |
+
# print(df)
|
277 |
+
# # save to csv in a folder under politweet with timestamp in name
|
278 |
+
# df.to_csv(f"{datetime.now().strftime('%Y-%m-%d %H-%M-%S')}_tweets.csv")
|
279 |
|
|
|
|
|
|