Spaces:

politweet-sh
/

politweet

Runtime error

App Files Files Community

politweet / tests /scraper_test.py

olofbengtsson

More tests for scraper_test.py and handling of scrapes with no results in TwitterScraper.py

58a2a29 about 2 years ago

raw

history blame

7.11 kB

	import re
	import unittest

	import pandas as pd
	from datetime import datetime
	import sys
	from pathlib import Path
	# Put the "twitterscraper" directory, located in the parent of this file's
	# directory, first on sys.path so that the import below can be resolved.
	sys.path.insert(0, str(Path(__file__).parents[1]) + "/twitterscraper")

	from TwitterScraper import TwitterScraper # This is not a problem; it finds the file

	class MyTestCase(unittest.TestCase):
	    """Tests for the DataFrames returned by TwitterScraper.

	    ``setUpClass`` runs three scrapes once (one user, several users, and a
	    free-text search) and stores the results on the class. Every test then
	    inspects those shared results instead of scraping again.
	    """

	    # Scrape parameters shared by setUpClass and by the tests. They are class
	    # attributes so there is a single definition; tests read them via ``self``.
	    num_tweets = 40
	    from_date = "2022-05-01"
	    to_date = "2022-07-31"
	    user = 'jimmieakesson'
	    user_list = [
	        'jimmieakesson', 'BuschEbba', 'annieloof', 'JohanPehrson',
	        'bolund', 'martastenevi', 'SwedishPM', 'dadgostarnooshi',
	    ]
	    search_string = 'miljö'
	    # NOTE(review): presumably the expected number of DataFrame columns;
	    # no test reads it yet.
	    nbr_of_cols = 9

	    @classmethod
	    def setUpClass(cls):
	        """Run the three scrapes once and cache the results on the class."""
	        print('Super set up')
	        # One scraper instance per query, all with the same parameters.
	        sc1 = TwitterScraper(from_date=cls.from_date, to_date=cls.to_date, num_tweets=cls.num_tweets)
	        sc2 = TwitterScraper(from_date=cls.from_date, to_date=cls.to_date, num_tweets=cls.num_tweets)
	        sc3 = TwitterScraper(from_date=cls.from_date, to_date=cls.to_date, num_tweets=cls.num_tweets)
	        cls._df_uni = sc1.scrape_by_user(cls.user)
	        cls._df_poly = sc2.scrape_by_several_users(cls.user_list)
	        cls._df_by_string = sc3.scrape_by_string(cls.search_string)

	    def setUp(self):
	        """Announce the start of each test; parameters live on the class."""
	        print('set up')

	    def _scrapes(self):
	        """Return ``(label, DataFrame)`` pairs for the three cached scrapes.

	        The label is only used in assertion messages, to tell which scrape
	        a failure belongs to.
	        """
	        return (
	            ('single user', self._df_uni),
	            ('several users', self._df_poly),
	            ('search string', self._df_by_string),
	        )

	    @staticmethod
	    def _is_url_only(tweet):
	        """Return True if ``tweet`` is a single token starting with 'https'."""
	        tokens = tweet.split()
	        return len(tokens) == 1 and tokens[0].startswith('https')

	    # Checks that the returned datatype is pandas DataFrame
	    def test_correct_type(self):
	        print('Checking type...')
	        for label, frame in self._scrapes():
	            self.assertIsInstance(frame, pd.DataFrame, label)

	    # Checks that we get the correct number of tweets
	    # NOTE: for now it only checks that we don't over-sample
	    def test_correct_nbr_tweets(self):
	        print('Checking number of tweets...')
	        for label, frame in self._scrapes():
	            self.assertLessEqual(frame.shape[0], self.num_tweets, label)

	    # Checks that all dates are between the start date and the end date.
	    # The ``date`` column is parsed with datetime.fromisoformat, so it is
	    # expected to hold ISO-format strings.
	    def test_dates(self):
	        print('Checking dates...')
	        d_start = datetime.fromisoformat(self.from_date)
	        d_end = datetime.fromisoformat(self.to_date)
	        for label, frame in self._scrapes():
	            out_of_range = [
	                date for date in frame.date
	                if not d_start <= datetime.fromisoformat(date) <= d_end
	            ]
	            self.assertEqual(out_of_range, [], label)

	    # Checks that all tweets are from the correct user
	    def test_user(self):
	        print('Checking user...')
	        wrong_users = [name for name in self._df_uni.username if name != self.user]
	        self.assertEqual(wrong_users, [])

	    # Checks that all user_ids in the single-user scrape are the same
	    def test_user_id(self):
	        print('Checking user ids...')
	        # Collecting the values in a set avoids a label lookup such as
	        # ``[...][0]``, which raises when the scrape returned no rows.
	        distinct_ids = set(self._df_uni.user_id)
	        self.assertLessEqual(len(distinct_ids), 1, distinct_ids)

	    # Checks if there are tweets that have been sampled several times
	    def test_no_doubles(self):
	        print('Checking doubles...')
	        for label, frame in self._scrapes():
	            self.assertEqual(len(set(frame.id)), frame.shape[0], label)

	    # Checks that we have no None entries
	    def test_none(self):
	        print('Checking Nones...')
	        for label, frame in self._scrapes():
	            # Iterating a DataFrame yields its column labels, not its cells,
	            # so the null mask has to be reduced explicitly.
	            self.assertFalse(frame.isnull().values.any(), label)

	    # Checks that no tweet consists of nothing but a URL
	    def test_no_url_tweets(self):
	        print('Checking url tweets...')
	        for label, frame in self._scrapes():
	            url_only = [tweet for tweet in frame.tweet if self._is_url_only(tweet)]
	            self.assertEqual(url_only, [], label)

	    # Checks that the several-users scrape only contains requested users
	    def test_many_users(self):
	        unexpected = [name for name in self._df_poly.username if name not in self.user_list]
	        self.assertEqual(unexpected, [])

	    # Checks that each username in the several-users scrape maps to one user_id
	    def test_many_user_ids(self):
	        first_id_for = {}
	        # zip over the column values instead of ``[...][i]`` label lookups, so
	        # the check does not depend on the DataFrame having a 0..n-1 index.
	        for username, user_id in zip(self._df_poly['username'], self._df_poly['user_id']):
	            expected_id = first_id_for.setdefault(username, user_id)
	            self.assertEqual(user_id, expected_id, username)

	    # Checks that every tweet from the string search contains the search string
	    def test_string_search(self):
	        # Also accept the spelling with ø/æ substituted for ö/ä.
	        alternative = self.search_string.replace('ö', 'ø').replace('ä', 'æ')
	        search_strings = [self.search_string, alternative]
	        missing = [
	            tweet for tweet in self._df_by_string.tweet
	            if all(search not in tweet.lower() for search in search_strings)
	        ]
	        self.assertEqual(missing, [])



	# Run the test suite when this file is executed directly as a script.
	if __name__ == '__main__':
	    unittest.main()