# File: politweet/tests/scraper_test.py (7.11 kB)
# Author: olofbengtsson
# Commit 58a2a29: More tests for scraper_test.py and handling of scrapes with
# no results in TwitterScraper.py
import re
import unittest
import pandas as pd
from datetime import datetime
import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parents[1]) + "/twitterscraper")
from TwitterScraper import TwitterScraper # Detta är inget problem, den hittar filen
class MyTestCase(unittest.TestCase):
    """Checks on the DataFrames returned by the three TwitterScraper queries.

    ``setUpClass`` runs ``scrape_by_user``, ``scrape_by_several_users`` and
    ``scrape_by_string`` once each and caches the results on the class; the
    individual tests only inspect those cached frames.
    """

    # Scrape parameters. They are defined once on the class so that the
    # scrapes performed in setUpClass and the assertions in the tests always
    # agree on the same values.
    num_tweets = 40
    from_date = "2022-05-01"
    to_date = "2022-07-31"
    user = 'jimmieakesson'
    user_list = ['jimmieakesson', 'BuschEbba', 'annieloof', 'JohanPehrson', 'bolund', 'martastenevi', 'SwedishPM',
                 'dadgostarnooshi']
    search_string = 'miljö'

    @classmethod
    def setUpClass(cls):
        """Run every scrape exactly once and cache the resulting frames."""
        print('Super set up')
        # One scraper instance per query, all built before the first scrape.
        scraper_uni, scraper_poly, scraper_by_string = (
            TwitterScraper(from_date=cls.from_date, to_date=cls.to_date, num_tweets=cls.num_tweets)
            for _ in range(3)
        )
        cls._df_uni = scraper_uni.scrape_by_user(cls.user)
        # Hand over a copy so the scraper cannot alter the list the tests
        # later compare against.
        cls._df_poly = scraper_poly.scrape_by_several_users(list(cls.user_list))
        cls._df_by_string = scraper_by_string.scrape_by_string(cls.search_string)

    def setUp(self):
        """Announce the start of a test; the parameters live on the class."""
        print('set up')

    def _frames(self):
        """Return the cached scrape results keyed by a label for failure messages."""
        return {
            'scrape_by_user': self._df_uni,
            'scrape_by_several_users': self._df_poly,
            'scrape_by_string': self._df_by_string,
        }

    @staticmethod
    def _is_url_only(tweet):
        """Return True when the tweet is a single token starting with 'https'."""
        words = tweet.split()
        return len(words) == 1 and words[0].startswith('https')

    # Checks that the returned datatype is pandas DataFrame
    def test_correct_type(self):
        print('Checking type...')
        for label, df in self._frames().items():
            self.assertIsInstance(df, pd.DataFrame, msg=label)

    # Checks that we get the correct number of tweets
    # NOTE: for now it only checks that we don't over-sample
    def test_correct_nbr_tweets(self):
        print('Checking number of tweets...')
        for label, df in self._frames().items():
            self.assertLessEqual(df.shape[0], self.num_tweets, msg=label)

    # Checks that all dates are between the start date and the end date
    def test_dates(self):
        print('Checking dates...')
        d_start = datetime.fromisoformat(self.from_date)
        d_end = datetime.fromisoformat(self.to_date)
        for label, df in self._frames().items():
            # Collect the offending values so a failure shows what was wrong.
            out_of_range = [
                date for date in df['date']
                if not d_start <= datetime.fromisoformat(date) <= d_end
            ]
            self.assertEqual(out_of_range, [], msg=label)

    # Checks that all tweets are from the correct user
    def test_user(self):
        print('Checking user...')
        unexpected = set(self._df_uni['username']) - {self.user}
        self.assertEqual(unexpected, set())

    # Checks that all user_ids are correct
    def test_user_id(self):
        print('Checking user ids...')
        # A single-user scrape must hold at most one distinct id. Counting
        # distinct values (instead of comparing against the row labelled 0)
        # also works for an empty result or a non-default index.
        distinct_ids = self._df_uni['user_id'].nunique(dropna=False)
        self.assertLessEqual(distinct_ids, 1)

    # Checks if there are tweets that have been sampled several times
    def test_no_doubles(self):
        print('Checking doubles...')
        for label, df in self._frames().items():
            ids = df['id']
            repeated = ids[ids.duplicated()].tolist()
            self.assertEqual(repeated, [], msg=label)

    # Checks that we have no None entries
    def test_none(self):
        print('Checking Nones...')
        for label, df in self._frames().items():
            # Iterating a DataFrame yields its column labels, not its cells,
            # so the null mask has to be reduced explicitly.
            null_counts = df.isnull().sum()
            columns_with_nulls = null_counts[null_counts > 0].to_dict()
            self.assertEqual(columns_with_nulls, {}, msg=label)

    # Checks that no tweet consists of nothing but a single URL
    def test_no_url_tweets(self):
        print('Checking url tweets...')
        for label, df in self._frames().items():
            url_only = [tweet for tweet in df['tweet'] if self._is_url_only(tweet)]
            self.assertEqual(url_only, [], msg=label)

    # Checks that the multi-user scrape only contains the requested users
    def test_many_users(self):
        unexpected = set(self._df_poly['username']) - set(self.user_list)
        self.assertEqual(unexpected, set())

    # Checks that each username in the multi-user scrape maps to one user_id
    def test_many_user_ids(self):
        id_by_user = {}
        rows = zip(self._df_poly['username'], self._df_poly['user_id'])
        for username, user_id in rows:
            # The first id seen for a username is the reference for the rest.
            expected_id = id_by_user.setdefault(username, user_id)
            self.assertEqual(user_id, expected_id, msg=f'user_id mismatch for {username}')

    # Checks that every tweet from the string search contains the search term
    def test_string_search(self):
        # Also accept the spelling with 'ø'/'æ' in place of 'ö'/'ä'
        # (presumably Danish/Norwegian variants of the term - TODO confirm).
        variant = self.search_string.replace('ö', 'ø').replace('ä', 'æ')
        accepted_terms = (self.search_string, variant)
        for tweet in self._df_by_string['tweet']:
            lowered = tweet.lower()
            self.assertTrue(any(term in lowered for term in accepted_terms), msg=tweet)
# Run the test suite when this file is executed directly as a script.
if __name__ == '__main__':
    unittest.main()