Spaces:

politweet-sh
/

politweet

Runtime error

App Files Files Community

politweet / tests /scraper_test.py

Demea9000

cleaned up unittests

ba446a6 over 2 years ago

raw

history blame

7.56 kB

	import re
	import unittest

	import pandas as pd
	from datetime import datetime
	import sys
	from pathlib import Path
	from twitterscraper import TwitterScraper as ts

	# Put the "twitterscraper" directory that sits next to this file's parent
	# directory at the front of the module search path.
	# NOTE(review): this executes after the `from twitterscraper import ...`
	# statement above, so it cannot influence that import -- confirm whether
	# it was meant to run before it.
	sys.path.insert(0, f"{Path(__file__).parents[1]}/twitterscraper")


	class MyTestCase(unittest.TestCase):
	    """Checks on the results of the three ``TwitterScraper`` queries.

	    ``setUpClass`` performs one scrape per query type (single user, several
	    users, free-text search) and caches the results on the class; each test
	    then inspects those cached frames.
	    """

	    # Query parameters. Defined once on the class so the scrapes issued in
	    # ``setUpClass`` and the bounds asserted by the tests cannot drift apart.
	    num_tweets = 40
	    from_date = "2022-05-01"
	    to_date = "2022-07-31"
	    user = 'jimmieakesson'
	    user_list = (
	        'jimmieakesson', 'BuschEbba', 'annieloof', 'JohanPehrson', 'bolund',
	        'martastenevi', 'SwedishPM', 'dadgostarnooshi',
	    )
	    search_string = 'miljö'

	    # Not read by any test in this class.
	    nbr_of_cols = 9

	    @classmethod
	    def setUpClass(cls):
	        """
	        Runs the three scrapes once and stores the results on the class
	        :return:
	        """
	        print('Super set up')
	        # One scraper instance per query, all built before the first scrape.
	        sc_user, sc_users, sc_text = (
	            ts.TwitterScraper(from_date=cls.from_date, to_date=cls.to_date, num_tweets=cls.num_tweets)
	            for _ in range(3)
	        )
	        cls._df_uni = sc_user.scrape_by_user(cls.user)
	        cls._df_poly = sc_users.scrape_by_several_users(list(cls.user_list))
	        cls._df_by_string = sc_text.scrape_by_string(cls.search_string)

	    def setUp(self):
	        """
	        Per-test hook. Only logs; the query parameters are class attributes
	        :return:
	        """
	        print('set up')

	    def _frames(self):
	        """
	        Returns the cached scrape results keyed by a short label
	        :return: dict mapping label -> scraped frame
	        """
	        return {
	            'uni': self._df_uni,
	            'poly': self._df_poly,
	            'by_string': self._df_by_string,
	        }

	    @staticmethod
	    def _is_url_only(tweet):
	        """
	        Tells whether a tweet is a single whitespace-free token starting with 'https'
	        :return: bool
	        """
	        words = tweet.split()
	        return len(words) == 1 and words[0].startswith('https')

	    def test_correct_type(self):
	        """
	        Checks that the returned datatype is pandas DataFrame
	        :return:
	        """
	        print('Checking type...')
	        for label, frame in self._frames().items():
	            with self.subTest(frame=label):
	                self.assertIsInstance(frame, pd.DataFrame)

	    def test_correct_nbr_tweets(self):
	        """
	        Checks that we get the correct number of tweets.
	        OBS FOR NOW IT ONLY CHECKS THAT WE DON'T OVER SAMPLE
	        TODO: Check that we get the correct number of tweets.
	        :return:
	        """
	        print('Checking number of tweets...')
	        for label, frame in self._frames().items():
	            with self.subTest(frame=label):
	                self.assertLessEqual(len(frame), self.num_tweets)

	    def test_dates(self):
	        """
	        Checks that all dates are between the start date and the end date
	        :return:
	        """
	        print('Checking dates...')
	        d_start = datetime.fromisoformat(self.from_date)
	        # NOTE(review): this bound is midnight at the start of ``to_date``, so
	        # an entry carrying a later time on that day would be rejected --
	        # confirm against the format of the ``date`` column.
	        d_end = datetime.fromisoformat(self.to_date)
	        for label, frame in self._frames().items():
	            with self.subTest(frame=label):
	                out_of_range = [
	                    date for date in frame['date']
	                    if not d_start <= datetime.fromisoformat(date) <= d_end
	                ]
	                self.assertEqual(out_of_range, [])

	    def test_user(self):
	        """
	        Checks that all tweets are from the correct user
	        :return:
	        """
	        print('Checking user...')
	        wrong_users = [name for name in self._df_uni['username'] if name != self.user]
	        self.assertEqual(wrong_users, [])

	    def test_user_id(self):
	        """
	        Checks that all user_ids are correct
	        :return:
	        """
	        print('Checking user ids...')
	        # Compare positionally: a label lookup with key 0 raises KeyError on
	        # an empty frame or on an index that does not contain 0.
	        user_ids = list(self._df_uni['user_id'])
	        mismatched = [uid for uid in user_ids if uid != user_ids[0]]
	        self.assertEqual(mismatched, [])

	    def test_no_doubles(self):
	        """
	        Checks that there are no tweets that have been sampled several times
	        :return:
	        """
	        print('Checking doubles...')
	        for label, frame in self._frames().items():
	            with self.subTest(frame=label):
	                tweet_ids = frame['id']
	                self.assertEqual(len(set(tweet_ids)), len(tweet_ids))

	    def test_none(self):
	        """
	        Checks that there are no None entries
	        :return:
	        """
	        print('Checking Nones...')
	        for label, frame in self._frames().items():
	            with self.subTest(frame=label):
	                # Iterating a DataFrame yields its column labels, not its
	                # cells, so reduce the null mask per column instead.
	                null_mask = frame.isnull().any()
	                columns_with_nulls = [col for col, has_null in null_mask.items() if has_null]
	                self.assertEqual(columns_with_nulls, [])

	    def test_no_url_tweets(self):
	        """
	        Checks that no tweet consists of a single https link only
	        :return:
	        """
	        print('Checking url tweets...')
	        for label, frame in self._frames().items():
	            with self.subTest(frame=label):
	                url_only = [tweet for tweet in frame['tweet'] if self._is_url_only(tweet)]
	                self.assertEqual(url_only, [])

	    def test_many_users(self):
	        """
	        Checks that the multi-user scrape only contains the requested users
	        :return:
	        """
	        # Handles are compared case-insensitively: the requested list mixes
	        # cases and the casing used in the ``username`` column is not
	        # visible from this file.
	        requested = {name.lower() for name in self.user_list}
	        unexpected = sorted(
	            {name for name in self._df_poly['username'] if name.lower() not in requested}
	        )
	        self.assertEqual(unexpected, [])

	    def test_many_user_ids(self):
	        """
	        Checks that every username in the multi-user scrape maps to one user_id
	        :return:
	        """
	        ids_per_user = {}
	        for name, uid in zip(self._df_poly['username'], self._df_poly['user_id']):
	            ids_per_user.setdefault(name, set()).add(uid)
	        inconsistent = {name: ids for name, ids in ids_per_user.items() if len(ids) > 1}
	        self.assertEqual(inconsistent, {})

	    def test_string_search(self):
	        """
	        Checks that every tweet of the text search contains the search term
	        TODO: Check this
	        :return:
	        """
	        # Besides the term itself, accept the spelling with ö -> ø and ä -> æ
	        # (presumably to match Danish/Norwegian tweets -- confirm).
	        alternative = self.search_string.replace('ö', 'ø').replace('ä', 'æ')
	        accepted = [self.search_string, alternative]
	        missing_term = [
	            tweet for tweet in self._df_by_string['tweet']
	            if not any(term in tweet.lower() for term in accepted)
	        ]
	        self.assertEqual(missing_term, [])


	# Run the test suite when this file is executed directly rather than
	# imported by a test runner.
	if __name__ == "__main__":
	    unittest.main()