Oresti Theodoridis committed on
Commit
c3a63c7
2 Parent(s): 44e11ec e7a61fb

Merge branch 'develop' into 36-create-process-to-store-and-retrieve-data-for-textclassifier

Browse files
.idea/politweet.iml CHANGED
@@ -3,7 +3,6 @@
3
  <component name="NewModuleRootManager">
4
  <content url="file://$MODULE_DIR$">
5
  <excludeFolder url="file://$MODULE_DIR$/politweet-environment" />
6
- <excludeFolder url="file://$MODULE_DIR$/venv" />
7
  </content>
8
  <orderEntry type="inheritedJdk" />
9
  <orderEntry type="sourceFolder" forTests="false" />
 
3
  <component name="NewModuleRootManager">
4
  <content url="file://$MODULE_DIR$">
5
  <excludeFolder url="file://$MODULE_DIR$/politweet-environment" />
 
6
  </content>
7
  <orderEntry type="inheritedJdk" />
8
  <orderEntry type="sourceFolder" forTests="false" />
README.md CHANGED
@@ -20,9 +20,14 @@ För att få alla dependencies:
20
 
21
  1. skapa en virtual environment: https://docs.python.org/3/library/venv.html
22
  2. Aktivera din virtual environment
23
- 2. gå till projektets root path och skriv i terminalen:
24
- $ env2/bin/python -m pip install -r requirements.txt
25
- 3. I vissa fall funkar det inte att installera twint för Ubuntu. Efter att ha ställt in allt funkade det efter att ha kört "sudo apt-get install build- essential" i terminalen.
 
 
 
 
 
26
 
27
 
28
 
 
20
 
21
  1. skapa en virtual environment: https://docs.python.org/3/library/venv.html
22
  2. Aktivera din virtual environment
23
+ 3. gå till projektets root path och skriv i terminalen:
24
+ $ pip install -r requirements.txt
25
+ 4. I vissa fall funkar det inte att installera twint för Ubuntu. Efter att ha ställt in allt funkade det efter att ha kört "sudo apt-get install build-essential" i terminalen.
26
+ 5. För att använda openai behövs en auktoriserings-token. Detta skapas genom att skapa en '.env' fil i projektets root path.
27
+ 6. Skriv in följande i den filen:
28
+ OPENAI_AUTHTOKEN=din open-ai token
29
+ 7. Nu borde TextClassifier kunna använda openai, givet att du har timmar att lägga till din token.
30
+
31
 
32
 
33
 
requirements.txt CHANGED
@@ -37,6 +37,7 @@ pycparser==2.21
37
  pyparsing==3.0.9
38
  PySocks==1.7.1
39
  python-dateutil==2.8.2
 
40
  python-socks==2.0.3
41
  pytz==2022.1
42
  regex==2022.6.2
 
37
  pyparsing==3.0.9
38
  PySocks==1.7.1
39
  python-dateutil==2.8.2
40
+ python-dotenv==0.20.0
41
  python-socks==2.0.3
42
  pytz==2022.1
43
  regex==2022.6.2
textclassifier/TextClassifier.py CHANGED
@@ -5,9 +5,17 @@ from twitterscraper import TwitterScraper
5
  from datetime import date
6
  import os
7
 
 
8
  # Set one directory up into ROOT_PATH
9
  ROOT_PATH = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
10
 
 
 
 
 
 
 
 
11
 
12
  class TextClassifier:
13
  def __init__(self, model_name="text-davinci-002", from_date='2022-01-01', to_date=str(date.today()),
@@ -21,6 +29,14 @@ class TextClassifier:
21
  :param to_date: string of the format 'YYYY-MM-DD'.
22
  :param num_tweets: integer value of the maximum number of tweets to be scraped.
23
  """
 
 
 
 
 
 
 
 
24
 
25
  self.model_name = model_name
26
  self.from_date = from_date
@@ -30,7 +46,7 @@ class TextClassifier:
30
  self.ts = TwitterScraper.TwitterScraper(from_date, to_date, num_tweets)
31
  self.df = self.ts.scrape_by_user(user_name)
32
  # self.api_key = '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'
33
- openai.api_key = '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'
34
 
35
  def scrape_tweets(self):
36
  """
@@ -51,7 +67,6 @@ class TextClassifier:
51
  classification_clean = classification_clean.replace(" ", "")
52
 
53
  return classification_clean
54
- return response.choices[0]['text']
55
 
56
  def classify_sentiment(self, text: str):
57
  """
@@ -205,9 +220,7 @@ class TextClassifier:
205
  df_topic['topic'] = df_topic['tweet'].apply(self.classify_topic)
206
  return df_topic
207
 
208
- def __repr__(self):
209
- return "TwitterScraper(from_date={}, to_date={}, num_tweets={})".format(self.from_date, self.to_date,
210
- self.num_tweets)
211
 
212
  @staticmethod
213
  def cleanup_topic_results(prediction_dict, text):
@@ -240,9 +253,27 @@ class TextClassifier:
240
  return None
241
 
242
 
243
- if __name__ == "__main__":
244
- import pandas as pd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
245
 
246
- pd.set_option('display.max_columns', None)
247
- TC = TextClassifier(from_date="2019-01-01", to_date="2019-12-31", user_name='jimmieakesson', num_tweets=100)
248
- TC.df_to_csv()
 
5
  from datetime import date
6
  import os
7
 
8
+
9
  # Set one directory up into ROOT_PATH
10
  ROOT_PATH = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
11
 
12
+ from dotenv import find_dotenv, load_dotenv
13
+
14
+ dotenv_path = find_dotenv()
15
+ load_dotenv(dotenv_path)
16
+ OPENAI_AUTHTOKEN = os.environ.get("OPENAI_AUTHTOKEN")
17
+
18
+
19
 
20
  class TextClassifier:
21
  def __init__(self, model_name="text-davinci-002", from_date='2022-01-01', to_date=str(date.today()),
 
29
  :param to_date: string of the format 'YYYY-MM-DD'.
30
  :param num_tweets: integer value of the maximum number of tweets to be scraped.
31
  """
32
+ # Make sure to_date is later than from_date
33
+ assert from_date < to_date, "from_date must be earlier than to_date"
34
+ # Make sure the dates are in the correct format
35
+ assert re.match(r'^\d{4}-\d{2}-\d{2}$', from_date) is not None, "from_date must be in the format YYYY-MM-DD"
36
+ # Make sure user_name is not empty
37
+ assert user_name is not None, "user_name cannot be empty"
38
+ # Make sure num_tweets is a positive integer
39
+ assert num_tweets > 0, "num_tweets must be a positive integer"
40
 
41
  self.model_name = model_name
42
  self.from_date = from_date
 
46
  self.ts = TwitterScraper.TwitterScraper(from_date, to_date, num_tweets)
47
  self.df = self.ts.scrape_by_user(user_name)
48
  # self.api_key = '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'
49
+ openai.api_key = OPENAI_AUTHTOKEN
50
 
51
  def scrape_tweets(self):
52
  """
 
67
  classification_clean = classification_clean.replace(" ", "")
68
 
69
  return classification_clean
 
70
 
71
  def classify_sentiment(self, text: str):
72
  """
 
220
  df_topic['topic'] = df_topic['tweet'].apply(self.classify_topic)
221
  return df_topic
222
 
223
+
 
 
224
 
225
  @staticmethod
226
  def cleanup_topic_results(prediction_dict, text):
 
253
  return None
254
 
255
 
256
+ def __repr__(self):
257
+ """
258
+ Gives a string that describes which user is classified
259
+ :return:
260
+ """
261
+ return "Classifier for user: " + self.user_name + " with model: " + self.model_name + "."
262
+
263
+
264
+ # if __name__ == "__main__":
265
+ # import pandas as pd
266
+ # from datetime import datetime
267
+ # import os
268
+ # # show all columns
269
+ # pd.set_option('display.max_columns', None)
270
+ #
271
+ # tc = TextClassifier(from_date="2019-01-01", to_date="2019-05-31", user_name='jimmieakesson', num_tweets=20)
272
+ # tc.classify_sentiment_of_tweets()
273
+ # # df = tc.analyze_sentiment_of_tweets()
274
+ # # print(df)
275
+ # df = tc.classify_topics_of_tweets()
276
+ # print(df)
277
+ # # save to csv in a folder under politweet with timestamp in name
278
+ # df.to_csv(f"{datetime.now().strftime('%Y-%m-%d %H-%M-%S')}_tweets.csv")
279