Oresti Theodoridis committed on
Commit
dcbfe5a
2 Parent(s): e65e077 c6bed98

Merge pull request #64 from Demea9000/63-fix-todos-in-twitterscraper

Browse files
textclassifier/TextClassifier.py CHANGED
@@ -29,21 +29,15 @@ class TextClassifier:
29
  :param to_date: string of the format 'YYYY-MM-DD'.
30
  :param num_tweets: integer value of the maximum number of tweets to be scraped.
31
  """
32
- # Make sure to_date is later than from_date
33
- assert from_date < to_date, "from_date must be earlier than to_date"
34
- # Make sure the dates are in the correct format
35
- assert re.match(r'^\d{4}-\d{2}-\d{2}$', from_date) is not None, "from_date must be in the format YYYY-MM-DD"
36
  # Make sure user_name is not empty
37
  assert user_name is not None, "user_name cannot be empty"
38
- # Make sure num_tweets is a positive integer
39
- assert 0 < num_tweets <= 20, "num_tweets must be a positive integer and at most 20"
40
 
 
41
  self.model_name = model_name
42
  self.from_date = from_date
43
  self.to_date = to_date
44
  self.num_tweets = num_tweets
45
  self.user_name = user_name
46
- self.ts = TwitterScraper.TwitterScraper(from_date, to_date, num_tweets)
47
  # Assure that scrape_by_user actually gets num_tweets
48
  # add timer in time-loop and stop after 10 seconds
49
  start_time = time.time()
 
29
  :param to_date: string of the format 'YYYY-MM-DD'.
30
  :param num_tweets: integer value of the maximum number of tweets to be scraped.
31
  """
 
 
 
 
32
  # Make sure user_name is not empty
33
  assert user_name is not None, "user_name cannot be empty"
 
 
34
 
35
+ self.ts = TwitterScraper.TwitterScraper(from_date, to_date, num_tweets)
36
  self.model_name = model_name
37
  self.from_date = from_date
38
  self.to_date = to_date
39
  self.num_tweets = num_tweets
40
  self.user_name = user_name
 
41
  # Assure that scrape_by_user actually gets num_tweets
42
  # add timer in time-loop and stop after 10 seconds
43
  start_time = time.time()
twitterscraper/TwitterScraper.py CHANGED
@@ -1,6 +1,7 @@
1
  import pandas as pd
2
  import twint
3
  from datetime import date
 
4
 
5
 
6
  class TwitterScraper(object):
@@ -13,10 +14,21 @@ class TwitterScraper(object):
13
  """
14
 
15
  def __init__(self, from_date="2022-07-01", to_date=str(date.today()), num_tweets=20):
16
- # TODO: add a check to make sure that the dates are in the correct format.
17
- # TODO: add a check to make sure that the number of tweets is a positive number.
18
- # TODO: add a check to make sure that the number of tweets
19
- # is not greater than the number of tweets in the date range.
 
 
 
 
 
 
 
 
 
 
 
20
  self.from_date = from_date
21
  self.to_date = to_date
22
  self.num_tweets = num_tweets
@@ -48,9 +60,13 @@ class TwitterScraper(object):
48
  self.conf.Search = _string # this tells twint configuration to search for string
49
  return self.__get_tweets__from_twint__()
50
 
51
- # TODO: Possibly include more than one user
52
  def scrape_by_user_and_string(self, _user: str, _string: str):
53
- """This method uses twint to extract tweets based on string and username"""
 
 
 
 
 
54
  self.conf.Username = _user
55
  self.conf.Search = _string
56
  return self.__get_tweets__from_twint__()
@@ -74,7 +90,8 @@ class TwitterScraper(object):
74
  return tweets_info
75
 
76
  def __get_tweets__from_twint__(self):
77
- """ __get_tweets_from_twint__
 
78
  tweet info is a dataframe with following columns
79
  Index(['id', 'conversation_id', 'created_at', 'date', 'timezone', 'place',
80
  'tweet', 'language', 'hashtags', 'cashtags', 'user_id', 'user_id_str',
@@ -94,12 +111,12 @@ class TwitterScraper(object):
94
  self.conf.Until = self.to_date
95
  self.conf.Hide_output = True # Hides the output. If set to False it will print tweets in the terminal window.
96
  twint.run.Search(self.conf)
97
- tweet_and_replies_inf = twint.output.panda.Tweets_df # here we say that output souldwe dataframe.
98
  if tweet_and_replies_inf.empty:
99
  print("No tweet containing the word \"" + self.conf.Search + "\" could be found!")
100
  else:
101
  tweet_and_replies_inf = tweet_and_replies_inf[
102
- ["id", "tweet", "date", "user_id", "username", "urls", 'nlikes', 'nreplies', 'nretweets']]
103
  return tweet_and_replies_inf
104
 
105
  # def __check_date_type(d1,d2): if (type(d1) or type(d2)) is not type("str"): # If the type of ite date input
@@ -111,7 +128,6 @@ class TwitterScraper(object):
111
  def __repr__(self):
112
  return "TwitterScraper(from_date={}, to_date={}, num_tweets={})".format(self.from_date, self.to_date,
113
  self.num_tweets)
114
-
115
 
116
 
117
  if __name__ == "__main__":
 
1
  import pandas as pd
2
  import twint
3
  from datetime import date
4
+ import re
5
 
6
 
7
  class TwitterScraper(object):
 
14
  """
15
 
16
  def __init__(self, from_date="2022-07-01", to_date=str(date.today()), num_tweets=20):
17
+ """
18
+ This method initializes the TwitterScraper class. It takes the user as input and collects the user's tweets
19
+ from 'from_date' to 'to_date'. If 'from_date' and 'to_date' are not specified, it collects the number of
20
+ tweets 'num_tweets' from today.
21
+ :param from_date: str (format: YYYY-MM-DD)
22
+ :param to_date: str (format: YYYY-MM-DD)
23
+ :param num_tweets: int (number of tweets to be scraped)
24
+ """
25
+ # Make sure the dates are in the correct format
26
+ assert re.match(r'^\d{4}-\d{2}-\d{2}$', from_date) is not None, "from_date must be in the format YYYY-MM-DD"
27
+ # Make sure to_date is later than from_date
28
+ assert from_date < to_date, "from_date must be earlier than to_date"
29
+ # Make sure num_tweets is a positive integer
30
+ assert 0 < num_tweets <= 20, "num_tweets must be a positive integer and at most 20"
31
+
32
  self.from_date = from_date
33
  self.to_date = to_date
34
  self.num_tweets = num_tweets
 
60
  self.conf.Search = _string # this tells twint configuration to search for string
61
  return self.__get_tweets__from_twint__()
62
 
 
63
  def scrape_by_user_and_string(self, _user: str, _string: str):
64
+ """
65
+ This method uses twint to extract tweets based on string and username. It takes a list of users as input.
66
+ :param _user: str
67
+ :param _string: str
68
+ :return: dataframe
69
+ """
70
  self.conf.Username = _user
71
  self.conf.Search = _string
72
  return self.__get_tweets__from_twint__()
 
90
  return tweets_info
91
 
92
  def __get_tweets__from_twint__(self):
93
+ """
94
+ __get_tweets_from_twint__
95
  tweet info is a dataframe with following columns
96
  Index(['id', 'conversation_id', 'created_at', 'date', 'timezone', 'place',
97
  'tweet', 'language', 'hashtags', 'cashtags', 'user_id', 'user_id_str',
 
111
  self.conf.Until = self.to_date
112
  self.conf.Hide_output = True # Hides the output. If set to False it will print tweets in the terminal window.
113
  twint.run.Search(self.conf)
114
+ tweet_and_replies_inf = twint.output.panda.Tweets_df # here we say that output is a dataframe
115
  if tweet_and_replies_inf.empty:
116
  print("No tweet containing the word \"" + self.conf.Search + "\" could be found!")
117
  else:
118
  tweet_and_replies_inf = tweet_and_replies_inf[
119
+ ["id", "tweet", "date", "user_id", "username", "urls", 'nlikes', 'nreplies', 'nretweets']]
120
  return tweet_and_replies_inf
121
 
122
  # def __check_date_type(d1,d2): if (type(d1) or type(d2)) is not type("str"): # If the type of ite date input
 
128
  def __repr__(self):
129
  return "TwitterScraper(from_date={}, to_date={}, num_tweets={})".format(self.from_date, self.to_date,
130
  self.num_tweets)
 
131
 
132
 
133
  if __name__ == "__main__":