olofbengtsson committed on
Commit
58a2a29
1 Parent(s): f8ad876

More tests for scraper_test.py and handling of scrapes with no results in TwitterScraper.py

Browse files
tests/scraper_test.py CHANGED
@@ -1,3 +1,4 @@
 
1
  import unittest
2
 
3
  import pandas as pd
@@ -18,8 +19,14 @@ class MyTestCase(unittest.TestCase):
18
  to_date = "2022-07-31"
19
  user = 'jimmieakesson'
20
  user_list = ['jimmieakesson', 'BuschEbba', 'annieloof', 'JohanPehrson', 'bolund', 'martastenevi', 'SwedishPM', 'dadgostarnooshi']
21
- sc = TwitterScraper(from_date=from_date, to_date=to_date, num_tweets=num_tweets)
22
- cls._df_uni = sc.scrape_by_user(user)
 
 
 
 
 
 
23
  nbr_of_cols = 9
24
 
25
  def setUp(self):
@@ -30,33 +37,52 @@ class MyTestCase(unittest.TestCase):
30
  self.user = 'jimmieakesson'
31
  self.user_list = ['jimmieakesson', 'BuschEbba', 'annieloof', 'JohanPehrson', 'bolund', 'martastenevi', 'SwedishPM',
32
  'dadgostarnooshi']
 
33
  # self.sc = TwitterScraper(from_date=from_date, to_date=to_date, num_tweets=num_tweets)
34
  nbr_of_cols = 9
35
 
36
  # Checks that the returned datatype is pandas DataFrame
37
  def test_correct_type(self):
38
  print('Checking type...')
39
- self.assertEqual(type(self._df_uni), type(pd.DataFrame())) # add assertion here
 
 
40
 
41
  # Checks that we get the correct number of tweets
42
- # OBS, FOR NOW IT ONLY CHECKS THAT WE DON'T OVERSAMPLE
43
  def test_correct_nbr_tweets(self):
44
  print('Checking number of tweets...')
45
- self.assertTrue(self._df_uni.shape[0] < self.num_tweets)
 
 
46
 
47
  # Checks that all dates are between the start date and the end date
48
  def test_dates(self):
49
  print('Checking dates...')
50
  d_start = datetime.fromisoformat(self.from_date)
51
  d_end = datetime.fromisoformat(self.to_date)
52
- correct_date = True
 
 
53
  for date in self._df_uni.date:
54
  d = datetime.fromisoformat(date)
55
  if not (d >= d_start and d <= d_end):
56
- correct_date = False
 
 
 
 
 
 
 
 
 
 
57
  break
58
 
59
- self.assertTrue(correct_date)
 
 
60
 
61
  # Checks that all tweets are from the correct user
62
  def test_user(self):
@@ -83,23 +109,72 @@ class MyTestCase(unittest.TestCase):
83
  # Checks if there are tweets that have been sampled several times
84
  def test_no_doubles(self):
85
  print('Checking doubles...')
86
- id_set = set(self._df_uni.id)
87
- self.assertTrue(len(id_set) == self._df_uni.shape[0])
 
 
 
 
88
 
89
  # Checks that we have no None entries
90
  def test_none(self):
91
  print('Checking Nones...')
92
  self.assertFalse(any(b == True for b in self._df_uni.isnull()))
 
 
93
 
94
  def test_no_url_tweets(self):
95
  print('Checking url tweets...')
96
- only_url = False
 
 
97
  for tweet in self._df_uni.tweet:
98
  if len(tweet.split()) == 1 and tweet.split()[0].startswith('https'):
99
  print(tweet.split())
100
- only_url = True
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
101
  break
102
- self.assertFalse(only_url)
 
 
 
 
 
 
 
 
 
 
103
 
104
 
105
 
 
1
+ import re
2
  import unittest
3
 
4
  import pandas as pd
 
19
  to_date = "2022-07-31"
20
  user = 'jimmieakesson'
21
  user_list = ['jimmieakesson', 'BuschEbba', 'annieloof', 'JohanPehrson', 'bolund', 'martastenevi', 'SwedishPM', 'dadgostarnooshi']
22
+ sc1 = TwitterScraper(from_date=from_date, to_date=to_date, num_tweets=num_tweets)
23
+ sc2 = TwitterScraper(from_date=from_date, to_date=to_date, num_tweets=num_tweets)
24
+ sc3 = TwitterScraper(from_date=from_date, to_date=to_date, num_tweets=num_tweets)
25
+ search_string = 'miljö'
26
+ cls._df_uni = sc1.scrape_by_user(user)
27
+ cls._df_poly = sc2.scrape_by_several_users(user_list)
28
+ cls._df_by_string = sc3.scrape_by_string(search_string)
29
+
30
  nbr_of_cols = 9
31
 
32
  def setUp(self):
 
37
  self.user = 'jimmieakesson'
38
  self.user_list = ['jimmieakesson', 'BuschEbba', 'annieloof', 'JohanPehrson', 'bolund', 'martastenevi', 'SwedishPM',
39
  'dadgostarnooshi']
40
+ self.search_string = 'miljö'
41
  # self.sc = TwitterScraper(from_date=from_date, to_date=to_date, num_tweets=num_tweets)
42
  nbr_of_cols = 9
43
 
44
  # Checks that the returned datatype is pandas DataFrame
45
  def test_correct_type(self):
46
  print('Checking type...')
47
+ self.assertEqual(type(self._df_uni), type(pd.DataFrame()))
48
+ self.assertEqual(type(self._df_poly), type(pd.DataFrame()))
49
+ self.assertEqual(type(self._df_by_string), type(pd.DataFrame()))
50
 
51
  # Checks that we get the correct number of tweets
52
# NOTE: for now this only checks that we do not oversample.
def test_correct_nbr_tweets(self):
    """Check that no scrape result has more rows than ``self.num_tweets``."""
    print('Checking number of tweets...')
    frames = {
        'uni': self._df_uni,
        'poly': self._df_poly,
        'by_string': self._df_by_string,
    }
    for label, frame in frames.items():
        with self.subTest(frame=label):
            # assertLessEqual prints both numbers on failure, unlike assertTrue(a <= b).
            self.assertLessEqual(frame.shape[0], self.num_tweets)
58
 
59
  # Checks that all dates are between the start date and the end date
60
  def test_dates(self):
61
  print('Checking dates...')
62
  d_start = datetime.fromisoformat(self.from_date)
63
  d_end = datetime.fromisoformat(self.to_date)
64
+ correct_date_uni = True
65
+ correct_date_poly = True
66
+ correct_date_by_string = True
67
  for date in self._df_uni.date:
68
  d = datetime.fromisoformat(date)
69
  if not (d >= d_start and d <= d_end):
70
+ correct_date_uni = False
71
+ break
72
+ for date in self._df_poly.date:
73
+ d = datetime.fromisoformat(date)
74
+ if not (d >= d_start and d <= d_end):
75
+ correct_date_poly = False
76
+ break
77
+ for date in self._df_by_string.date:
78
+ d = datetime.fromisoformat(date)
79
+ if not (d >= d_start and d <= d_end):
80
+ correct_date_by_string = False
81
  break
82
 
83
+ self.assertTrue(correct_date_uni)
84
+ self.assertTrue(correct_date_poly)
85
+ self.assertTrue(correct_date_by_string)
86
 
87
  # Checks that all tweets are from the correct user
88
  def test_user(self):
 
109
  # Checks if there are tweets that have been sampled several times
110
  def test_no_doubles(self):
111
  print('Checking doubles...')
112
+ id_set_uni = set(self._df_uni.id)
113
+ id_set_poly = set(self._df_poly.id)
114
+ id_set_by_string = set(self._df_by_string.id)
115
+ self.assertTrue(len(id_set_uni) == self._df_uni.shape[0])
116
+ self.assertTrue(len(id_set_poly) == self._df_poly.shape[0])
117
+ self.assertTrue(len(id_set_by_string) == self._df_by_string.shape[0])
118
 
119
  # Checks that we have no None entries
120
  def test_none(self):
121
  print('Checking Nones...')
122
  self.assertFalse(any(b == True for b in self._df_uni.isnull()))
123
+ self.assertFalse(any(b == True for b in self._df_poly.isnull()))
124
+ self.assertFalse(any(b == True for b in self._df_by_string.isnull()))
125
 
126
  def test_no_url_tweets(self):
127
  print('Checking url tweets...')
128
+ only_url_uni = False
129
+ only_url_poly = False
130
+ only_url_by_string = False
131
  for tweet in self._df_uni.tweet:
132
  if len(tweet.split()) == 1 and tweet.split()[0].startswith('https'):
133
  print(tweet.split())
134
+ only_url_uni = True
135
+ break
136
+ for tweet in self._df_poly.tweet:
137
+ if len(tweet.split()) == 1 and tweet.split()[0].startswith('https'):
138
+ print(tweet.split())
139
+ only_url_poly = True
140
+ break
141
+ for tweet in self._df_by_string.tweet:
142
+ if len(tweet.split()) == 1 and tweet.split()[0].startswith('https'):
143
+ print(tweet.split())
144
+ only_url_by_string = True
145
+ break
146
+ self.assertFalse(only_url_uni)
147
+ self.assertFalse(only_url_poly)
148
+ self.assertFalse(only_url_by_string)
149
+
150
def test_many_users(self):
    """Check that the multi-user scrape only contains users from ``self.user_list``."""
    # The multi-user frame is the one under test; the single-user frame
    # would pass trivially because its user is part of user_list.
    unexpected = set(self._df_poly.username) - set(self.user_list)
    self.assertFalse(unexpected, f"tweets from users outside user_list: {sorted(unexpected)}")
157
+
158
def test_many_user_ids(self):
    """Check that each username maps to one ``user_id`` across the multi-user scrape."""
    first_seen_id = {}
    # Walk the two columns in lockstep; this does not depend on the frame
    # having a default 0..n-1 integer index.
    for username, user_id in zip(self._df_poly['username'], self._df_poly['user_id']):
        expected = first_seen_id.setdefault(username, user_id)
        self.assertEqual(
            user_id,
            expected,
            f"user {username!r} appears with more than one user_id",
        )
168
+
169
def test_string_search(self):
    """Check that every tweet from the string search contains the search term.

    A tweet also passes when it contains the variant of the term in which
    'ö' is replaced by 'ø' and 'ä' by 'æ'.
    """
    # Tweets are lowercased before matching, so the term must be lowercased too.
    term = self.search_string.lower()
    variants = {term, term.replace('ö', 'ø').replace('ä', 'æ')}
    for tweet in self._df_by_string.tweet:
        text = tweet.lower()
        self.assertTrue(
            any(variant in text for variant in variants),
            f"tweet does not contain any of {sorted(variants)}: {tweet!r}",
        )
178
 
179
 
180
 
twitterscraper/TwitterScraper.py CHANGED
@@ -1,3 +1,4 @@
 
1
  import twint
2
  from datetime import date
3
 
@@ -20,7 +21,7 @@ class TwitterScraper(object):
20
  self.num_tweets = num_tweets
21
  self.conf = twint.Config()
22
 
23
- def scrape_by_user(self, _user):
24
  """This method uses twint to extract tweets based on username"""
25
  self.conf.Search = "from:@" + _user # is the search configuration is given in this format it searches after
26
  # user_names.
@@ -46,13 +47,14 @@ class TwitterScraper(object):
46
  self.conf.Search = _string # this tells twint configuration to search for string
47
  return self.__get_tweets__from_twint__()
48
 
 
49
  def scrape_by_user_and_string(self, _user: str, _string: str):
50
- """This method uses twint to extract tweets brased on string and username"""
51
  self.conf.Username = _user
52
  self.conf.Search = _string
53
  return self.__get_tweets__from_twint__()
54
 
55
- # TODO: make method static
56
  def get_only_tweets(self, tweet_and_replies_info):
57
  tweet_and_replies = tweet_and_replies_info["tweet"]
58
  """
@@ -89,12 +91,16 @@ class TwitterScraper(object):
89
  self.conf.Limit = self.num_tweets # specifies how many tweet should be scraped
90
  self.conf.Since = self.from_date
91
  self.conf.Until = self.to_date
92
- self.conf.Hide_output = True # Hides the output. If set to False it will prints tweets in the terminal window.
93
  twint.run.Search(self.conf)
94
  tweet_and_replies_inf = twint.output.panda.Tweets_df # here we say that output souldwe dataframe.
95
- tweet_and_replies_inf = tweet_and_replies_inf[
96
- ["id", "tweet", "date", "user_id", "username", "urls", 'nlikes', 'nreplies', 'nretweets']]
 
 
 
97
  return tweet_and_replies_inf
 
98
  # def __check_date_type(d1,d2): if (type(d1) or type(d2)) is not type("str"): # If the type of ite date input
99
  # is not string it generates exception print("[!] Please make sure the date is a string in this format
100
  # \"yyyy-mm-dd\" ") raise EXCEPTION("Incorrect date type Exception!") elif (len(d1.split("-")) or len(d2.split(
@@ -105,9 +111,18 @@ class TwitterScraper(object):
105
  return "TwitterScraper(from_date={}, to_date={}, num_tweets={})".format(self.from_date, self.to_date,
106
  self.num_tweets)
107
  if __name__ == "__main__":
 
108
  sc = TwitterScraper(from_date="2022-05-01", to_date="2022-07-31", num_tweets=40)
109
- dc = sc.scrape_by_user("jimmieakesson")
110
- print(dc.head())
111
- print(dc.shape)
 
 
 
 
 
 
 
 
112
 
113
 
 
1
+ import pandas as pd
2
  import twint
3
  from datetime import date
4
 
 
21
  self.num_tweets = num_tweets
22
  self.conf = twint.Config()
23
 
24
+ def scrape_by_user(self, _user: str):
25
  """This method uses twint to extract tweets based on username"""
26
  self.conf.Search = "from:@" + _user # is the search configuration is given in this format it searches after
27
  # user_names.
 
47
  self.conf.Search = _string # this tells twint configuration to search for string
48
  return self.__get_tweets__from_twint__()
49
 
50
# TODO: Possibly include more than one user
def scrape_by_user_and_string(self, _user: str, _string: str):
    """Extract tweets with twint based on both a username and a search string.

    Args:
        _user: Value assigned to ``conf.Username`` for this scrape.
        _string: Value assigned to ``conf.Search`` for this scrape.

    Returns:
        Whatever ``__get_tweets__from_twint__`` returns for this configuration.
    """
    # ``conf`` is shared by every scrape method of the instance. Every scrape
    # method visible here assigns ``Search`` itself, but ``Username`` is only
    # set here, so restore its previous value to keep it from leaking into
    # later scrapes on the same object.
    previous_username = getattr(self.conf, "Username", None)
    self.conf.Username = _user
    self.conf.Search = _string
    try:
        return self.__get_tweets__from_twint__()
    finally:
        self.conf.Username = previous_username
56
 
57
+ # TODO: make method static (Possibly remove this)
58
  def get_only_tweets(self, tweet_and_replies_info):
59
  tweet_and_replies = tweet_and_replies_info["tweet"]
60
  """
 
91
  self.conf.Limit = self.num_tweets # specifies how many tweet should be scraped
92
  self.conf.Since = self.from_date
93
  self.conf.Until = self.to_date
94
+ self.conf.Hide_output = True # Hides the output. If set to False it will print tweets in the terminal window.
95
  twint.run.Search(self.conf)
96
  tweet_and_replies_inf = twint.output.panda.Tweets_df # here we say that output souldwe dataframe.
97
+ if tweet_and_replies_inf.empty:
98
+ print("No tweet containing the word \"" + self.conf.Search + "\" could be found!")
99
+ else:
100
+ tweet_and_replies_inf = tweet_and_replies_inf[
101
+ ["id", "tweet", "date", "user_id", "username", "urls", 'nlikes', 'nreplies', 'nretweets']]
102
  return tweet_and_replies_inf
103
+
104
  # def __check_date_type(d1,d2): if (type(d1) or type(d2)) is not type("str"): # If the type of ite date input
105
  # is not string it generates exception print("[!] Please make sure the date is a string in this format
106
  # \"yyyy-mm-dd\" ") raise EXCEPTION("Incorrect date type Exception!") elif (len(d1.split("-")) or len(d2.split(
 
111
  return "TwitterScraper(from_date={}, to_date={}, num_tweets={})".format(self.from_date, self.to_date,
112
  self.num_tweets)
113
  if __name__ == "__main__":
114
+ user_list = ['jimmieakesson', 'BuschEbba', 'annieloof', 'JohanPehrson', 'bolund', 'martastenevi', 'SwedishPM', 'dadgostarnooshi']
115
  sc = TwitterScraper(from_date="2022-05-01", to_date="2022-07-31", num_tweets=40)
116
+ dc1 = sc.scrape_by_user("jimmieakesson")
117
+ dc2 = sc.scrape_by_user_and_string("jimmieakesson", "pension")
118
+ dc3 = sc.scrape_by_several_users(user_list)
119
+ print(dc2)
120
+ print(dc1.head())
121
+ print(dc3.head)
122
+
123
+
124
+
125
+
126
+
127
 
128