Spaces:
Runtime error
Runtime error
olofbengtsson
committed on
Commit
•
58a2a29
1
Parent(s):
f8ad876
More tests for scraper_test.py and handling of scrapes with no results in TwitterScraper.py
Browse files- tests/scraper_test.py +88 -13
- twitterscraper/TwitterScraper.py +24 -9
tests/scraper_test.py
CHANGED
@@ -1,3 +1,4 @@
|
|
|
|
1 |
import unittest
|
2 |
|
3 |
import pandas as pd
|
@@ -18,8 +19,14 @@ class MyTestCase(unittest.TestCase):
|
|
18 |
to_date = "2022-07-31"
|
19 |
user = 'jimmieakesson'
|
20 |
user_list = ['jimmieakesson', 'BuschEbba', 'annieloof', 'JohanPehrson', 'bolund', 'martastenevi', 'SwedishPM', 'dadgostarnooshi']
|
21 |
-
|
22 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
23 |
nbr_of_cols = 9
|
24 |
|
25 |
def setUp(self):
|
@@ -30,33 +37,52 @@ class MyTestCase(unittest.TestCase):
|
|
30 |
self.user = 'jimmieakesson'
|
31 |
self.user_list = ['jimmieakesson', 'BuschEbba', 'annieloof', 'JohanPehrson', 'bolund', 'martastenevi', 'SwedishPM',
|
32 |
'dadgostarnooshi']
|
|
|
33 |
# self.sc = TwitterScraper(from_date=from_date, to_date=to_date, num_tweets=num_tweets)
|
34 |
nbr_of_cols = 9
|
35 |
|
36 |
# Checks that the returned datatype is pandas DataFrame
|
37 |
def test_correct_type(self):
|
38 |
print('Checking type...')
|
39 |
-
self.assertEqual(type(self._df_uni), type(pd.DataFrame()))
|
|
|
|
|
40 |
|
41 |
# Checks that we get the correct number of tweets
|
42 |
-
# OBS, FOR NOW IT ONLY CHECKS THAT WE DON'T
|
43 |
def test_correct_nbr_tweets(self):
|
44 |
print('Checking number of tweets...')
|
45 |
-
self.assertTrue(self._df_uni.shape[0]
|
|
|
|
|
46 |
|
47 |
# Checks that all dates are between the start date and the end date
|
48 |
def test_dates(self):
|
49 |
print('Checking dates...')
|
50 |
d_start = datetime.fromisoformat(self.from_date)
|
51 |
d_end = datetime.fromisoformat(self.to_date)
|
52 |
-
|
|
|
|
|
53 |
for date in self._df_uni.date:
|
54 |
d = datetime.fromisoformat(date)
|
55 |
if not (d >= d_start and d <= d_end):
|
56 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
57 |
break
|
58 |
|
59 |
-
self.assertTrue(
|
|
|
|
|
60 |
|
61 |
# Checks that all tweets are from the correct user
|
62 |
def test_user(self):
|
@@ -83,23 +109,72 @@ class MyTestCase(unittest.TestCase):
|
|
83 |
# Checks if there are tweets that have been sampled several times
|
84 |
def test_no_doubles(self):
|
85 |
print('Checking doubles...')
|
86 |
-
|
87 |
-
|
|
|
|
|
|
|
|
|
88 |
|
89 |
# Checks that we have no None entries
|
90 |
def test_none(self):
|
91 |
print('Checking Nones...')
|
92 |
self.assertFalse(any(b == True for b in self._df_uni.isnull()))
|
|
|
|
|
93 |
|
94 |
def test_no_url_tweets(self):
|
95 |
print('Checking url tweets...')
|
96 |
-
|
|
|
|
|
97 |
for tweet in self._df_uni.tweet:
|
98 |
if len(tweet.split()) == 1 and tweet.split()[0].startswith('https'):
|
99 |
print(tweet.split())
|
100 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
101 |
break
|
102 |
-
self.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
103 |
|
104 |
|
105 |
|
|
|
1 |
+
import re
|
2 |
import unittest
|
3 |
|
4 |
import pandas as pd
|
|
|
19 |
to_date = "2022-07-31"
|
20 |
user = 'jimmieakesson'
|
21 |
user_list = ['jimmieakesson', 'BuschEbba', 'annieloof', 'JohanPehrson', 'bolund', 'martastenevi', 'SwedishPM', 'dadgostarnooshi']
|
22 |
+
sc1 = TwitterScraper(from_date=from_date, to_date=to_date, num_tweets=num_tweets)
|
23 |
+
sc2 = TwitterScraper(from_date=from_date, to_date=to_date, num_tweets=num_tweets)
|
24 |
+
sc3 = TwitterScraper(from_date=from_date, to_date=to_date, num_tweets=num_tweets)
|
25 |
+
search_string = 'miljö'
|
26 |
+
cls._df_uni = sc1.scrape_by_user(user)
|
27 |
+
cls._df_poly = sc2.scrape_by_several_users(user_list)
|
28 |
+
cls._df_by_string = sc3.scrape_by_string(search_string)
|
29 |
+
|
30 |
nbr_of_cols = 9
|
31 |
|
32 |
def setUp(self):
|
|
|
37 |
self.user = 'jimmieakesson'
|
38 |
self.user_list = ['jimmieakesson', 'BuschEbba', 'annieloof', 'JohanPehrson', 'bolund', 'martastenevi', 'SwedishPM',
|
39 |
'dadgostarnooshi']
|
40 |
+
self.search_string = 'miljö'
|
41 |
# self.sc = TwitterScraper(from_date=from_date, to_date=to_date, num_tweets=num_tweets)
|
42 |
nbr_of_cols = 9
|
43 |
|
44 |
# Checks that the returned datatype is pandas DataFrame
|
45 |
def test_correct_type(self):
|
46 |
print('Checking type...')
|
47 |
+
self.assertEqual(type(self._df_uni), type(pd.DataFrame()))
|
48 |
+
self.assertEqual(type(self._df_poly), type(pd.DataFrame()))
|
49 |
+
self.assertEqual(type(self._df_by_string), type(pd.DataFrame()))
|
50 |
|
51 |
# Checks that we get the correct number of tweets
|
52 |
+
# NOTE: for now it only checks that we don't oversample
|
53 |
def test_correct_nbr_tweets(self):
|
54 |
print('Checking number of tweets...')
|
55 |
+
self.assertTrue(self._df_uni.shape[0] <= self.num_tweets)
|
56 |
+
self.assertTrue(self._df_poly.shape[0] <= self.num_tweets)
|
57 |
+
self.assertTrue(self._df_by_string.shape[0] <= self.num_tweets)
|
58 |
|
59 |
# Checks that all dates are between the start date and the end date
|
60 |
def test_dates(self):
|
61 |
print('Checking dates...')
|
62 |
d_start = datetime.fromisoformat(self.from_date)
|
63 |
d_end = datetime.fromisoformat(self.to_date)
|
64 |
+
correct_date_uni = True
|
65 |
+
correct_date_poly = True
|
66 |
+
correct_date_by_string = True
|
67 |
for date in self._df_uni.date:
|
68 |
d = datetime.fromisoformat(date)
|
69 |
if not (d >= d_start and d <= d_end):
|
70 |
+
correct_date_uni = False
|
71 |
+
break
|
72 |
+
for date in self._df_poly.date:
|
73 |
+
d = datetime.fromisoformat(date)
|
74 |
+
if not (d >= d_start and d <= d_end):
|
75 |
+
correct_date_poly = False
|
76 |
+
break
|
77 |
+
for date in self._df_by_string.date:
|
78 |
+
d = datetime.fromisoformat(date)
|
79 |
+
if not (d >= d_start and d <= d_end):
|
80 |
+
correct_date_by_string = False
|
81 |
break
|
82 |
|
83 |
+
self.assertTrue(correct_date_uni)
|
84 |
+
self.assertTrue(correct_date_poly)
|
85 |
+
self.assertTrue(correct_date_by_string)
|
86 |
|
87 |
# Checks that all tweets are from the correct user
|
88 |
def test_user(self):
|
|
|
109 |
# Checks if there are tweets that have been sampled several times
|
110 |
def test_no_doubles(self):
|
111 |
print('Checking doubles...')
|
112 |
+
id_set_uni = set(self._df_uni.id)
|
113 |
+
id_set_poly = set(self._df_poly.id)
|
114 |
+
id_set_by_string = set(self._df_by_string.id)
|
115 |
+
self.assertTrue(len(id_set_uni) == self._df_uni.shape[0])
|
116 |
+
self.assertTrue(len(id_set_poly) == self._df_poly.shape[0])
|
117 |
+
self.assertTrue(len(id_set_by_string) == self._df_by_string.shape[0])
|
118 |
|
119 |
# Checks that we have no None entries
|
120 |
def test_none(self):
|
121 |
print('Checking Nones...')
|
122 |
self.assertFalse(any(b == True for b in self._df_uni.isnull()))
|
123 |
+
self.assertFalse(any(b == True for b in self._df_poly.isnull()))
|
124 |
+
self.assertFalse(any(b == True for b in self._df_by_string.isnull()))
|
125 |
|
126 |
def test_no_url_tweets(self):
|
127 |
print('Checking url tweets...')
|
128 |
+
only_url_uni = False
|
129 |
+
only_url_poly = False
|
130 |
+
only_url_by_string = False
|
131 |
for tweet in self._df_uni.tweet:
|
132 |
if len(tweet.split()) == 1 and tweet.split()[0].startswith('https'):
|
133 |
print(tweet.split())
|
134 |
+
only_url_uni = True
|
135 |
+
break
|
136 |
+
for tweet in self._df_poly.tweet:
|
137 |
+
if len(tweet.split()) == 1 and tweet.split()[0].startswith('https'):
|
138 |
+
print(tweet.split())
|
139 |
+
only_url_poly = True
|
140 |
+
break
|
141 |
+
for tweet in self._df_by_string.tweet:
|
142 |
+
if len(tweet.split()) == 1 and tweet.split()[0].startswith('https'):
|
143 |
+
print(tweet.split())
|
144 |
+
only_url_by_string = True
|
145 |
+
break
|
146 |
+
self.assertFalse(only_url_uni)
|
147 |
+
self.assertFalse(only_url_poly)
|
148 |
+
self.assertFalse(only_url_by_string)
|
149 |
+
|
150 |
+
def test_many_users(self):
|
151 |
+
correct_users = True
|
152 |
+
for user in self._df_uni.username:
|
153 |
+
if user not in self.user_list:
|
154 |
+
correct_users = False
|
155 |
+
break
|
156 |
+
self.assertTrue(correct_users)
|
157 |
+
|
158 |
+
def test_many_user_ids(self):
|
159 |
+
correct_ids = True
|
160 |
+
user_dict = {}
|
161 |
+
for i in range(self._df_uni.shape[0]):
|
162 |
+
if self._df_uni['username'][i] not in user_dict:
|
163 |
+
user_dict[self._df_uni['username'][i]] = self._df_uni['user_id'][i]
|
164 |
+
if self._df_uni['user_id'][i] != user_dict[self._df_uni['username'][i]]:
|
165 |
+
correct_ids = False
|
166 |
break
|
167 |
+
self.assertTrue(correct_ids)
|
168 |
+
|
169 |
+
def test_string_search(self):
|
170 |
+
correct_search = True
|
171 |
+
search = re.sub('ö', 'ø', self.search_string)
|
172 |
+
search = re.sub('ä', 'æ', search)
|
173 |
+
search_strings = [self.search_string, search]
|
174 |
+
for tweet in self._df_by_string.tweet:
|
175 |
+
if all(search not in tweet.lower() for search in search_strings):
|
176 |
+
correct_search = False
|
177 |
+
self.assertTrue(correct_search)
|
178 |
|
179 |
|
180 |
|
twitterscraper/TwitterScraper.py
CHANGED
@@ -1,3 +1,4 @@
|
|
|
|
1 |
import twint
|
2 |
from datetime import date
|
3 |
|
@@ -20,7 +21,7 @@ class TwitterScraper(object):
|
|
20 |
self.num_tweets = num_tweets
|
21 |
self.conf = twint.Config()
|
22 |
|
23 |
-
def scrape_by_user(self, _user):
|
24 |
"""This method uses twint to extract tweets based on username"""
|
25 |
self.conf.Search = "from:@" + _user # if the search configuration is given in this format it searches after
|
26 |
# user_names.
|
@@ -46,13 +47,14 @@ class TwitterScraper(object):
|
|
46 |
self.conf.Search = _string # this tells twint configuration to search for string
|
47 |
return self.__get_tweets__from_twint__()
|
48 |
|
|
|
49 |
def scrape_by_user_and_string(self, _user: str, _string: str):
|
50 |
-
"""This method uses twint to extract tweets
|
51 |
self.conf.Username = _user
|
52 |
self.conf.Search = _string
|
53 |
return self.__get_tweets__from_twint__()
|
54 |
|
55 |
-
# TODO: make method static
|
56 |
def get_only_tweets(self, tweet_and_replies_info):
|
57 |
tweet_and_replies = tweet_and_replies_info["tweet"]
|
58 |
"""
|
@@ -89,12 +91,16 @@ class TwitterScraper(object):
|
|
89 |
self.conf.Limit = self.num_tweets # specifies how many tweet should be scraped
|
90 |
self.conf.Since = self.from_date
|
91 |
self.conf.Until = self.to_date
|
92 |
-
self.conf.Hide_output = True # Hides the output. If set to False it will
|
93 |
twint.run.Search(self.conf)
|
94 |
tweet_and_replies_inf = twint.output.panda.Tweets_df # here we say that the output should be a dataframe.
|
95 |
-
|
96 |
-
|
|
|
|
|
|
|
97 |
return tweet_and_replies_inf
|
|
|
98 |
# def __check_date_type(d1,d2): if (type(d1) or type(d2)) is not type("str"): # If the type of ite date input
|
99 |
# is not string it generates exception print("[!] Please make sure the date is a string in this format
|
100 |
# \"yyyy-mm-dd\" ") raise EXCEPTION("Incorrect date type Exception!") elif (len(d1.split("-")) or len(d2.split(
|
@@ -105,9 +111,18 @@ class TwitterScraper(object):
|
|
105 |
return "TwitterScraper(from_date={}, to_date={}, num_tweets={})".format(self.from_date, self.to_date,
|
106 |
self.num_tweets)
|
107 |
if __name__ == "__main__":
|
|
|
108 |
sc = TwitterScraper(from_date="2022-05-01", to_date="2022-07-31", num_tweets=40)
|
109 |
-
|
110 |
-
|
111 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
112 |
|
113 |
|
|
|
1 |
+
import pandas as pd
|
2 |
import twint
|
3 |
from datetime import date
|
4 |
|
|
|
21 |
self.num_tweets = num_tweets
|
22 |
self.conf = twint.Config()
|
23 |
|
24 |
+
def scrape_by_user(self, _user: str):
|
25 |
"""This method uses twint to extract tweets based on username"""
|
26 |
self.conf.Search = "from:@" + _user # if the search configuration is given in this format it searches after
|
27 |
# user_names.
|
|
|
47 |
self.conf.Search = _string # this tells twint configuration to search for string
|
48 |
return self.__get_tweets__from_twint__()
|
49 |
|
50 |
+
# TODO: Possibly include more than one user
|
51 |
def scrape_by_user_and_string(self, _user: str, _string: str):
|
52 |
+
"""This method uses twint to extract tweets based on string and username"""
|
53 |
self.conf.Username = _user
|
54 |
self.conf.Search = _string
|
55 |
return self.__get_tweets__from_twint__()
|
56 |
|
57 |
+
# TODO: make method static (Possibly remove this)
|
58 |
def get_only_tweets(self, tweet_and_replies_info):
|
59 |
tweet_and_replies = tweet_and_replies_info["tweet"]
|
60 |
"""
|
|
|
91 |
self.conf.Limit = self.num_tweets # specifies how many tweet should be scraped
|
92 |
self.conf.Since = self.from_date
|
93 |
self.conf.Until = self.to_date
|
94 |
+
self.conf.Hide_output = True # Hides the output. If set to False it will print tweets in the terminal window.
|
95 |
twint.run.Search(self.conf)
|
96 |
tweet_and_replies_inf = twint.output.panda.Tweets_df # here we say that the output should be a dataframe.
|
97 |
+
if tweet_and_replies_inf.empty:
|
98 |
+
print("No tweet containing the word \"" + self.conf.Search + "\" could be found!")
|
99 |
+
else:
|
100 |
+
tweet_and_replies_inf = tweet_and_replies_inf[
|
101 |
+
["id", "tweet", "date", "user_id", "username", "urls", 'nlikes', 'nreplies', 'nretweets']]
|
102 |
return tweet_and_replies_inf
|
103 |
+
|
104 |
# def __check_date_type(d1,d2): if (type(d1) or type(d2)) is not type("str"): # If the type of ite date input
|
105 |
# is not string it generates exception print("[!] Please make sure the date is a string in this format
|
106 |
# \"yyyy-mm-dd\" ") raise EXCEPTION("Incorrect date type Exception!") elif (len(d1.split("-")) or len(d2.split(
|
|
|
111 |
return "TwitterScraper(from_date={}, to_date={}, num_tweets={})".format(self.from_date, self.to_date,
|
112 |
self.num_tweets)
|
113 |
if __name__ == "__main__":
|
114 |
+
user_list = ['jimmieakesson', 'BuschEbba', 'annieloof', 'JohanPehrson', 'bolund', 'martastenevi', 'SwedishPM', 'dadgostarnooshi']
|
115 |
sc = TwitterScraper(from_date="2022-05-01", to_date="2022-07-31", num_tweets=40)
|
116 |
+
dc1 = sc.scrape_by_user("jimmieakesson")
|
117 |
+
dc2 = sc.scrape_by_user_and_string("jimmieakesson", "pension")
|
118 |
+
dc3 = sc.scrape_by_several_users(user_list)
|
119 |
+
print(dc2)
|
120 |
+
print(dc1.head())
|
121 |
+
print(dc3.head)
|
122 |
+
|
123 |
+
|
124 |
+
|
125 |
+
|
126 |
+
|
127 |
|
128 |
|