# politweet/tests/scraper_test.py
# Last change: commit ba446a6 ("cleaned up unittests") by Demea9000.
import re
import unittest
import pandas as pd
from datetime import datetime
import sys
from pathlib import Path
from twitterscraper import TwitterScraper as ts
# Put "<parent of this file's directory>/twitterscraper" at the front of the
# module search path.
# NOTE(review): this runs after the `from twitterscraper import ...` line above
# has already executed -- confirm whether it was meant to run before that import.
sys.path.insert(0, str(Path(__file__).parents[1]) + "/twitterscraper")
class MyTestCase(unittest.TestCase):
    """
    Checks on the DataFrames returned by ``TwitterScraper``.

    The three scraping calls are made once, in ``setUpClass``, and the
    resulting frames are shared by every test:

    * ``_df_uni``       -- result of ``scrape_by_user`` for ``USER``
    * ``_df_poly``      -- result of ``scrape_by_several_users`` for ``USER_LIST``
    * ``_df_by_string`` -- result of ``scrape_by_string`` for ``SEARCH_STRING``

    The tests read the columns ``date``, ``username``, ``user_id``, ``id`` and
    ``tweet`` from those frames.
    """

    # Scraper configuration. Kept in one place so that setUpClass (which does
    # the scraping) and setUp (which exposes the values to the tests) cannot
    # drift apart.
    NUM_TWEETS = 40
    FROM_DATE = "2022-05-01"
    TO_DATE = "2022-07-31"
    USER = 'jimmieakesson'
    USER_LIST = [
        'jimmieakesson', 'BuschEbba', 'annieloof', 'JohanPehrson', 'bolund',
        'martastenevi', 'SwedishPM', 'dadgostarnooshi',
    ]
    SEARCH_STRING = 'miljö'

    @classmethod
    def setUpClass(cls):
        """
        Run each scrape once and cache the resulting frames on the class.
        :return:
        """
        print('Super set up')

        def new_scraper():
            # A separate scraper instance is used for every query.
            return ts.TwitterScraper(from_date=cls.FROM_DATE,
                                     to_date=cls.TO_DATE,
                                     num_tweets=cls.NUM_TWEETS)

        cls._df_uni = new_scraper().scrape_by_user(cls.USER)
        cls._df_poly = new_scraper().scrape_by_several_users(cls.USER_LIST)
        cls._df_by_string = new_scraper().scrape_by_string(cls.SEARCH_STRING)

    def setUp(self):
        """
        Expose the scraper configuration as instance attributes.
        :return:
        """
        print('set up')
        self.num_tweets = self.NUM_TWEETS
        self.from_date = self.FROM_DATE
        self.to_date = self.TO_DATE
        self.user = self.USER
        # Copy so that a test mutating the list cannot affect other tests.
        self.user_list = list(self.USER_LIST)
        self.search_string = self.SEARCH_STRING

    def _frames(self):
        """
        Return ``(label, frame)`` pairs for the three scraped DataFrames.
        The label is only used to identify the frame in failure reports.
        """
        return [
            ('uni', self._df_uni),
            ('poly', self._df_poly),
            ('by_string', self._df_by_string),
        ]

    @staticmethod
    def _is_url_only(tweet):
        """
        Return True when the tweet is a single whitespace-delimited token
        that starts with ``https``.
        """
        words = tweet.split()
        return len(words) == 1 and words[0].startswith('https')

    def test_correct_type(self):
        """
        Checks that the returned datatype is pandas DataFrame
        :return:
        """
        print('Checking type...')
        for name, df in self._frames():
            with self.subTest(frame=name):
                self.assertIsInstance(df, pd.DataFrame)

    def test_correct_nbr_tweets(self):
        """
        Checks that we get the correct number of tweets.
        OBS FOR NOW IT ONLY CHECKS THAT WE DON'T OVER SAMPLE
        TODO: Check that we get the correct number of tweets.
        :return:
        """
        print('Checking number of tweets...')
        for name, df in self._frames():
            with self.subTest(frame=name):
                self.assertLessEqual(len(df), self.num_tweets)

    def test_dates(self):
        """
        Checks that all dates are between the start date and the end date
        (both bounds inclusive). The ``date`` column is parsed with
        ``datetime.fromisoformat``.
        :return:
        """
        print('Checking dates...')
        d_start = datetime.fromisoformat(self.from_date)
        d_end = datetime.fromisoformat(self.to_date)
        for name, df in self._frames():
            with self.subTest(frame=name):
                out_of_range = [
                    date for date in df['date']
                    if not d_start <= datetime.fromisoformat(date) <= d_end
                ]
                self.assertEqual(
                    out_of_range, [],
                    f'dates outside [{self.from_date}, {self.to_date}]')

    def test_user(self):
        """
        Checks that all tweets are from the correct user
        :return:
        """
        print('Checking user...')
        unexpected = {name for name in self._df_uni['username']
                      if name != self.user}
        self.assertFalse(unexpected, f'tweets from other users: {unexpected}')

    def test_user_id(self):
        """
        Checks that all user_ids are correct, i.e. that the single-user
        frame contains at most one distinct user id.
        :return:
        """
        print('Checking user ids...')
        # nunique() is positional, so it works whatever the frame's index
        # looks like and also when the frame is empty. dropna=False makes a
        # missing id count as a distinct value instead of being ignored.
        distinct_ids = self._df_uni['user_id'].nunique(dropna=False)
        self.assertLessEqual(distinct_ids, 1,
                             'single-user scrape contains several user ids')

    def test_no_doubles(self):
        """
        Checks that there are no tweets that have been sampled several times
        :return:
        """
        print('Checking doubles...')
        for name, df in self._frames():
            with self.subTest(frame=name):
                self.assertEqual(len(set(df['id'])), len(df),
                                 'duplicate tweet ids found')

    def test_none(self):
        """
        Checks that there are no None entries
        :return:
        """
        print('Checking Nones...')
        for name, df in self._frames():
            with self.subTest(frame=name):
                null_mask = df.isnull()
                # Iterating a DataFrame yields its column labels, not its
                # cells, so the boolean mask has to be reduced explicitly.
                columns_with_nulls = df.columns[null_mask.any()].tolist()
                self.assertFalse(
                    null_mask.values.any(),
                    f'null entries in columns: {columns_with_nulls}')

    def test_no_url_tweets(self):
        """
        Checks that no tweet consists of nothing but a single URL
        :return:
        """
        print('Checking url tweets...')
        for name, df in self._frames():
            with self.subTest(frame=name):
                url_only = [tweet for tweet in df['tweet']
                            if self._is_url_only(tweet)]
                self.assertEqual(url_only, [], 'tweets that are only a URL')

    def test_many_users(self):
        """
        Checks that every tweet in the multi-user frame comes from one of
        the requested users
        :return:
        """
        # Twitter handles are case-insensitive, so compare in lower case.
        allowed = {name.lower() for name in self.user_list}
        unexpected = {name for name in self._df_poly['username']
                      if str(name).lower() not in allowed}
        self.assertFalse(unexpected,
                         f'tweets from users not requested: {unexpected}')

    def test_many_user_ids(self):
        """
        Checks that, in the multi-user frame, each username is always paired
        with the same user_id
        :return:
        """
        id_by_username = {}
        # zip() walks the two columns positionally, so this does not depend
        # on the frame having a 0..n-1 index.
        pairs = zip(self._df_poly['username'], self._df_poly['user_id'])
        for username, user_id in pairs:
            first_seen_id = id_by_username.setdefault(username, user_id)
            self.assertEqual(
                user_id, first_seen_id,
                f'user {username!r} appears with more than one user_id')

    def test_string_search(self):
        """
        Checks that every tweet returned by the string search contains the
        search string, or its variant with 'ö' -> 'ø' and 'ä' -> 'æ'.
        TODO: Check this
        :return:
        """
        variant = self.search_string.replace('ö', 'ø').replace('ä', 'æ')
        # Tweets are lower-cased before matching, so lower-case the terms too.
        accepted = [self.search_string.lower(), variant.lower()]
        for tweet in self._df_by_string['tweet']:
            text = tweet.lower()
            self.assertTrue(
                any(term in text for term in accepted),
                f'tweet does not contain any of {accepted}: {tweet!r}')
# Run the test suite when this file is executed directly as a script.
if __name__ == '__main__':
    unittest.main()