Spaces:
Runtime error
Runtime error
File size: 7,564 Bytes
58a2a29 4795530 f8ad876 ba446a6 f8ad876 4795530 ba446a6 58a2a29 4795530 ba446a6 4795530 ba446a6 4795530 58a2a29 4795530 ba446a6 4795530 58a2a29 4795530 ba446a6 4795530 58a2a29 4795530 58a2a29 4795530 58a2a29 4795530 ba446a6 4795530 ba446a6 4795530 ba446a6 4795530 58a2a29 4795530 ba446a6 4795530 58a2a29 4795530 58a2a29 4795530 58a2a29 4795530 58a2a29 ba446a6 58a2a29 4795530 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 |
import re
import unittest
import pandas as pd
from datetime import datetime
import sys
from pathlib import Path
from twitterscraper import TwitterScraper as ts
# Put "<parent of this file's directory>/twitterscraper" first on the module search path.
# NOTE(review): this statement runs after the ``twitterscraper`` import on the
# previous line, so it cannot influence that import -- confirm the intended order.
sys.path.insert(0, f"{Path(__file__).parents[1]}/twitterscraper")
class MyTestCase(unittest.TestCase):
    """Checks on the data frames returned by ``TwitterScraper``.

    Three scrapes (one user, several users, free-text search) are executed
    once in :meth:`setUpClass` and the resulting frames are shared by every
    test method. The tests only read the columns ``date``, ``username``,
    ``user_id``, ``id`` and ``tweet``.
    """

    # Scrape parameters. Kept on the class so that the scrape performed in
    # setUpClass and the assertions in the tests cannot drift apart.
    num_tweets = 40
    from_date = "2022-05-01"
    to_date = "2022-07-31"
    user = 'jimmieakesson'
    user_list = ['jimmieakesson', 'BuschEbba', 'annieloof', 'JohanPehrson', 'bolund', 'martastenevi',
                 'SwedishPM',
                 'dadgostarnooshi']
    search_string = 'miljö'

    @classmethod
    def setUpClass(cls):
        """Run the three scrapes once and store the results on the class."""
        print('Super set up')
        # One scraper instance per scrape, all created before any scraping starts.
        sc1, sc2, sc3 = (
            ts.TwitterScraper(from_date=cls.from_date, to_date=cls.to_date, num_tweets=cls.num_tweets)
            for _ in range(3)
        )
        cls._df_uni = sc1.scrape_by_user(cls.user)
        cls._df_poly = sc2.scrape_by_several_users(cls.user_list)
        cls._df_by_string = sc3.scrape_by_string(cls.search_string)

    def setUp(self):
        """Announce the start of each test; the parameters live on the class."""
        print('set up')

    def _frames(self):
        """Return the three scraped frames keyed by a label for failure output."""
        return {
            'single user': self._df_uni,
            'several users': self._df_poly,
            'string search': self._df_by_string,
        }

    @staticmethod
    def _is_url_only(tweet):
        """Return True if ``tweet`` is a single token starting with ``https``."""
        words = tweet.split()
        return len(words) == 1 and words[0].startswith('https')

    def test_correct_type(self):
        """
        Checks that the returned datatype is pandas DataFrame
        :return:
        """
        print('Checking type...')
        for label, frame in self._frames().items():
            with self.subTest(scrape=label):
                self.assertIsInstance(frame, pd.DataFrame)

    def test_correct_nbr_tweets(self):
        """
        Checks that we do not get more tweets than requested.
        TODO: Check that we get the correct number of tweets (for now only
        over-sampling is detected).
        :return:
        """
        print('Checking number of tweets...')
        for label, frame in self._frames().items():
            with self.subTest(scrape=label):
                self.assertLessEqual(frame.shape[0], self.num_tweets)

    def test_dates(self):
        """
        Checks that all dates are between the start date and the end date
        :return:
        """
        print('Checking dates...')
        d_start = datetime.fromisoformat(self.from_date)
        # NOTE(review): ``to_date`` parses to midnight, so a timestamp later on
        # that same day counts as out of range -- confirm this is intended.
        d_end = datetime.fromisoformat(self.to_date)
        for label, frame in self._frames().items():
            with self.subTest(scrape=label):
                out_of_range = [
                    date for date in frame['date']
                    if not d_start <= datetime.fromisoformat(date) <= d_end
                ]
                self.assertEqual(out_of_range, [])

    def test_user(self):
        """
        Checks that all tweets of the single-user scrape are from that user
        :return:
        """
        print('Checking user...')
        unexpected = set(self._df_uni['username']) - {self.user}
        self.assertEqual(unexpected, set())

    def test_user_id(self):
        """
        Checks that the single-user scrape contains a single user id
        :return:
        """
        print('Checking user ids...')
        # Counting distinct values avoids label-based ``[0]`` indexing, which
        # raises KeyError on an empty frame or a non-zero-based index.
        self.assertLessEqual(self._df_uni['user_id'].nunique(dropna=False), 1)

    def test_no_doubles(self):
        """
        Checks that there are no tweets that have been sampled several times
        :return:
        """
        print('Checking doubles...')
        for label, frame in self._frames().items():
            with self.subTest(scrape=label):
                repeated = frame.loc[frame['id'].duplicated(), 'id'].tolist()
                self.assertEqual(repeated, [])

    def test_none(self):
        """
        Checks that there are no None entries
        :return:
        """
        print('Checking Nones...')
        for label, frame in self._frames().items():
            with self.subTest(scrape=label):
                # Iterating a DataFrame yields its column labels, not its
                # cells, so the null mask has to be reduced explicitly.
                null_counts = {
                    column: int(count)
                    for column, count in frame.isnull().sum().items()
                    if count
                }
                self.assertEqual(null_counts, {})

    def test_no_url_tweets(self):
        """
        Checks that no tweet consists solely of a single https link
        :return:
        """
        print('Checking url tweets...')
        for label, frame in self._frames().items():
            with self.subTest(scrape=label):
                url_only = [tweet for tweet in frame['tweet'] if self._is_url_only(tweet)]
                self.assertEqual(url_only, [])

    def test_many_users(self):
        """
        Checks that the multi-user scrape only contains the requested users
        :return:
        """
        unexpected = set(self._df_poly['username']) - set(self.user_list)
        self.assertEqual(unexpected, set())

    def test_many_user_ids(self):
        """
        Checks that every username in the multi-user scrape maps to one user id
        :return:
        """
        ids_per_user = self._df_poly.groupby('username')['user_id'].nunique()
        inconsistent = ids_per_user[ids_per_user > 1].index.tolist()
        self.assertEqual(inconsistent, [])

    def test_string_search(self):
        """
        Checks that every tweet of the string search contains the search term,
        either as given or with 'ö'/'ä' written as 'ø'/'æ'.
        TODO: Check this
        :return:
        """
        needle = self.search_string.lower()
        variants = {needle, needle.translate(str.maketrans('öä', 'øæ'))}
        misses = [
            tweet for tweet in self._df_by_string['tweet']
            if not any(variant in tweet.lower() for variant in variants)
        ]
        self.assertEqual(misses, [])
# Run the test suite when this file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()
|