politweet / tests /scraper_test.py
olofbengtsson's picture
Added test folder with some tests for the scraper and a file to start writing in for the classifier
f8ad876
import unittest
import pandas as pd
from datetime import datetime
import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parents[1]) + "/twitterscraper")
from TwitterScraper import TwitterScraper  # Not a problem: the sys.path entry above lets Python find the file
class MyTestCase(unittest.TestCase):
    """Tests for the frame returned by ``TwitterScraper.scrape_by_user``.

    A single scrape is performed once for the whole class in ``setUpClass``;
    every test method then inspects the shared result ``_df_uni`` without
    modifying it.
    """

    # Scrape configuration. Kept as class attributes so that the class-level
    # scrape and the per-test assertions read the same values
    # (``self.num_tweets`` etc. resolve to these).
    num_tweets = 40
    from_date = "2022-05-01"
    to_date = "2022-07-31"
    user = 'jimmieakesson'
    user_list = ['jimmieakesson', 'BuschEbba', 'annieloof', 'JohanPehrson',
                 'bolund', 'martastenevi', 'SwedishPM', 'dadgostarnooshi']

    @classmethod
    def setUpClass(cls):
        """Scrape once for ``cls.user`` and store the result on the class."""
        print('Super set up')
        scraper = TwitterScraper(from_date=cls.from_date,
                                 to_date=cls.to_date,
                                 num_tweets=cls.num_tweets)
        cls._df_uni = scraper.scrape_by_user(cls.user)

    def setUp(self):
        """Announce the start of each test; configuration lives on the class."""
        print('set up')

    def test_correct_type(self):
        """The scraper returns a pandas DataFrame."""
        print('Checking type...')
        self.assertIsInstance(self._df_uni, pd.DataFrame)

    def test_correct_nbr_tweets(self):
        """The scraper never returns more tweets than requested.

        For now this only guards against oversampling; receiving exactly
        ``num_tweets`` rows is valid, hence ``<=`` rather than ``<``.
        """
        print('Checking number of tweets...')
        self.assertLessEqual(self._df_uni.shape[0], self.num_tweets)

    def test_dates(self):
        """Every tweet date lies between ``from_date`` and ``to_date``."""
        print('Checking dates...')
        d_start = datetime.fromisoformat(self.from_date)
        d_end = datetime.fromisoformat(self.to_date)
        # NOTE(review): assumes the `date` column holds ISO-format strings
        # accepted by datetime.fromisoformat -- confirm against the scraper.
        # NOTE(review): d_end is midnight at the start of `to_date`, so a
        # value carrying a later time on that day would be rejected.
        for raw_date in self._df_uni.date:
            tweet_date = datetime.fromisoformat(raw_date)
            self.assertTrue(
                d_start <= tweet_date <= d_end,
                f"date {raw_date!r} is outside "
                f"[{self.from_date}, {self.to_date}]",
            )

    def test_user(self):
        """Every row was written by the requested user."""
        print('Checking user...')
        unexpected = {name for name in self._df_uni.username
                      if name != self.user}
        self.assertFalse(unexpected,
                         f"rows from unexpected users: {unexpected}")

    def test_user_id(self):
        """Every row carries the same ``user_id``."""
        print('Checking user ids...')
        # Counting distinct values avoids the label lookup ``[...][0]``, which
        # raises KeyError on an empty frame or one whose index lacks label 0.
        # dropna=False makes a missing id count as a distinct (wrong) value.
        distinct_ids = self._df_uni['user_id'].nunique(dropna=False)
        self.assertLessEqual(distinct_ids, 1)

    def test_no_doubles(self):
        """No tweet id occurs more than once."""
        print('Checking doubles...')
        tweet_ids = self._df_uni['id']
        self.assertEqual(len(set(tweet_ids)), len(tweet_ids))

    def test_none(self):
        """The frame contains no null entries in any cell."""
        print('Checking Nones...')
        # Iterating a DataFrame yields its column labels, not its cells, so
        # the boolean mask has to be reduced over its values instead.
        self.assertFalse(self._df_uni.isnull().values.any())

    def test_no_url_tweets(self):
        """No tweet consists solely of a single https link."""
        print('Checking url tweets...')
        for tweet in self._df_uni.tweet:
            words = tweet.split()
            is_only_url = len(words) == 1 and words[0].startswith('https')
            self.assertFalse(is_only_url,
                             f"tweet consists only of a URL: {words}")
# Run the test suite only when this file is executed directly as a script,
# not when it is imported by another module or a test collector.
if __name__ == '__main__':
    unittest.main()