Spaces:

politweet-sh
/

politweet

Runtime error

App Files Files Community

olofbengtsson committed on Jul 14, 2022

Commit

58a2a29

•

1 Parent(s): f8ad876

More tests for scraper_test.py and handling of scrapes with no results in TwitterScraper.py

Browse files

Files changed (2) hide show

tests/scraper_test.py +88 -13
twitterscraper/TwitterScraper.py +24 -9

tests/scraper_test.py CHANGED Viewed

@@ -1,3 +1,4 @@
 import unittest
 import pandas as pd
@@ -18,8 +19,14 @@ class MyTestCase(unittest.TestCase):
         to_date = "2022-07-31"
         user = 'jimmieakesson'
         user_list = ['jimmieakesson', 'BuschEbba', 'annieloof', 'JohanPehrson', 'bolund', 'martastenevi', 'SwedishPM', 'dadgostarnooshi']
-        sc = TwitterScraper(from_date=from_date, to_date=to_date, num_tweets=num_tweets)
-        cls._df_uni = sc.scrape_by_user(user)
         nbr_of_cols = 9
     def setUp(self):
@@ -30,33 +37,52 @@ class MyTestCase(unittest.TestCase):
         self.user = 'jimmieakesson'
         self.user_list = ['jimmieakesson', 'BuschEbba', 'annieloof', 'JohanPehrson', 'bolund', 'martastenevi', 'SwedishPM',
                      'dadgostarnooshi']
         # self.sc = TwitterScraper(from_date=from_date, to_date=to_date, num_tweets=num_tweets)
         nbr_of_cols = 9
     # Checks that the returned datatype is pandas DataFrame
     def test_correct_type(self):
         print('Checking type...')
-        self.assertEqual(type(self._df_uni), type(pd.DataFrame()))  # add assertion here
     # Checks that we get the correct number of tweets
-    # OBS, FOR NOW IT ONLY CHECKS THAT WE DON'T OVERSAMPLE
     def test_correct_nbr_tweets(self):
         print('Checking number of tweets...')
-        self.assertTrue(self._df_uni.shape[0] < self.num_tweets)
     # Checks that all dates are between the start date and the end date
     def test_dates(self):
         print('Checking dates...')
         d_start = datetime.fromisoformat(self.from_date)
         d_end = datetime.fromisoformat(self.to_date)
-        correct_date = True
         for date in self._df_uni.date:
             d = datetime.fromisoformat(date)
             if not (d >= d_start and d <= d_end):
-                correct_date = False
                 break
-        self.assertTrue(correct_date)
     # Checks that all tweets are from the correct user
     def test_user(self):
@@ -83,23 +109,72 @@ class MyTestCase(unittest.TestCase):
     # Checks if there are tweets that have been sampled several times
     def test_no_doubles(self):
         print('Checking doubles...')
-        id_set = set(self._df_uni.id)
-        self.assertTrue(len(id_set) == self._df_uni.shape[0])
     # Checks that we have no None entries
     def test_none(self):
         print('Checking Nones...')
         self.assertFalse(any(b == True for b in self._df_uni.isnull()))
     def test_no_url_tweets(self):
         print('Checking url tweets...')
-        only_url = False
         for tweet in self._df_uni.tweet:
             if len(tweet.split()) == 1 and tweet.split()[0].startswith('https'):
                 print(tweet.split())
-                only_url = True
                 break
-        self.assertFalse(only_url)

+import re
 import unittest
 import pandas as pd
         to_date = "2022-07-31"
         user = 'jimmieakesson'
         user_list = ['jimmieakesson', 'BuschEbba', 'annieloof', 'JohanPehrson', 'bolund', 'martastenevi', 'SwedishPM', 'dadgostarnooshi']
+        sc1 = TwitterScraper(from_date=from_date, to_date=to_date, num_tweets=num_tweets)
+        sc2 = TwitterScraper(from_date=from_date, to_date=to_date, num_tweets=num_tweets)
+        sc3 = TwitterScraper(from_date=from_date, to_date=to_date, num_tweets=num_tweets)
+        search_string = 'miljö'
+        cls._df_uni = sc1.scrape_by_user(user)
+        cls._df_poly = sc2.scrape_by_several_users(user_list)
+        cls._df_by_string = sc3.scrape_by_string(search_string)
         nbr_of_cols = 9
     def setUp(self):
         self.user = 'jimmieakesson'
         self.user_list = ['jimmieakesson', 'BuschEbba', 'annieloof', 'JohanPehrson', 'bolund', 'martastenevi', 'SwedishPM',
                      'dadgostarnooshi']
+        self.search_string  = 'miljö'
         # self.sc = TwitterScraper(from_date=from_date, to_date=to_date, num_tweets=num_tweets)
         nbr_of_cols = 9
     # Checks that the returned datatype is pandas DataFrame
     def test_correct_type(self):
         print('Checking type...')
+        self.assertEqual(type(self._df_uni), type(pd.DataFrame()))
+        self.assertEqual(type(self._df_poly), type(pd.DataFrame()))
+        self.assertEqual(type(self._df_by_string), type(pd.DataFrame()))
     # Checks that we get the correct number of tweets
+    # OBS, FOR NOW IT ONLY CHECKS THAT WE DON'T OVER SAMPLE
     def test_correct_nbr_tweets(self):
         print('Checking number of tweets...')
+        self.assertTrue(self._df_uni.shape[0] <= self.num_tweets)
+        self.assertTrue(self._df_poly.shape[0] <= self.num_tweets)
+        self.assertTrue(self._df_by_string.shape[0] <= self.num_tweets)
     # Checks that all dates are between the start date and the end date
     def test_dates(self):
         print('Checking dates...')
         d_start = datetime.fromisoformat(self.from_date)
         d_end = datetime.fromisoformat(self.to_date)
+        correct_date_uni = True
+        correct_date_poly = True
+        correct_date_by_string = True
         for date in self._df_uni.date:
             d = datetime.fromisoformat(date)
             if not (d >= d_start and d <= d_end):
+                correct_date_uni = False
+                break
+        for date in self._df_poly.date:
+            d = datetime.fromisoformat(date)
+            if not (d >= d_start and d <= d_end):
+                correct_date_poly = False
+                break
+        for date in self._df_by_string.date:
+            d = datetime.fromisoformat(date)
+            if not (d >= d_start and d <= d_end):
+                correct_date_by_string = False
                 break
+        self.assertTrue(correct_date_uni)
+        self.assertTrue(correct_date_poly)
+        self.assertTrue(correct_date_by_string)
     # Checks that all tweets are from the correct user
     def test_user(self):
     # Checks if there are tweets that have been sampled several times
     def test_no_doubles(self):
         print('Checking doubles...')
+        id_set_uni = set(self._df_uni.id)
+        id_set_poly = set(self._df_poly.id)
+        id_set_by_string = set(self._df_by_string.id)
+        self.assertTrue(len(id_set_uni) == self._df_uni.shape[0])
+        self.assertTrue(len(id_set_poly) == self._df_poly.shape[0])
+        self.assertTrue(len(id_set_by_string) == self._df_by_string.shape[0])
     # Checks that we have no None entries
     def test_none(self):
         print('Checking Nones...')
         self.assertFalse(any(b == True for b in self._df_uni.isnull()))
+        self.assertFalse(any(b == True for b in self._df_poly.isnull()))
+        self.assertFalse(any(b == True for b in self._df_by_string.isnull()))
     def test_no_url_tweets(self):
         print('Checking url tweets...')
+        only_url_uni = False
+        only_url_poly = False
+        only_url_by_string = False
         for tweet in self._df_uni.tweet:
             if len(tweet.split()) == 1 and tweet.split()[0].startswith('https'):
                 print(tweet.split())
+                only_url_uni = True
+                break
+        for tweet in self._df_poly.tweet:
+            if len(tweet.split()) == 1 and tweet.split()[0].startswith('https'):
+                print(tweet.split())
+                only_url_poly = True
+                break
+        for tweet in self._df_by_string.tweet:
+            if len(tweet.split()) == 1 and tweet.split()[0].startswith('https'):
+                print(tweet.split())
+                only_url_by_string = True
+                break
+        self.assertFalse(only_url_uni)
+        self.assertFalse(only_url_poly)
+        self.assertFalse(only_url_by_string)
+    def test_many_users(self):
+        correct_users = True
+        for user in self._df_uni.username:
+            if user not in self.user_list:
+                correct_users = False
+                break
+        self.assertTrue(correct_users)
+    def test_many_user_ids(self):
+        correct_ids = True
+        user_dict = {}
+        for i in range(self._df_uni.shape[0]):
+            if self._df_uni['username'][i] not in user_dict:
+                user_dict[self._df_uni['username'][i]] = self._df_uni['user_id'][i]
+            if self._df_uni['user_id'][i] != user_dict[self._df_uni['username'][i]]:
+                correct_ids = False
                 break
+        self.assertTrue(correct_ids)
+    def test_string_search(self):
+        correct_search = True
+        search = re.sub('ö', 'ø', self.search_string)
+        search = re.sub('ä', 'æ', search)
+        search_strings = [self.search_string, search]
+        for tweet in self._df_by_string.tweet:
+            if all(search not in tweet.lower() for search in search_strings):
+                correct_search = False
+        self.assertTrue(correct_search)

twitterscraper/TwitterScraper.py CHANGED Viewed

@@ -1,3 +1,4 @@
 import twint
 from datetime import date
@@ -20,7 +21,7 @@ class TwitterScraper(object):
         self.num_tweets = num_tweets
         self.conf = twint.Config()
-    def scrape_by_user(self, _user):
         """This method uses twint to extract tweets  based on username"""
         self.conf.Search = "from:@" + _user  # if the search configuration is given in this format, it searches for
         # user names.
@@ -46,13 +47,14 @@ class TwitterScraper(object):
         self.conf.Search = _string  # this tells twint configuration to search for string
         return self.__get_tweets__from_twint__()
     def scrape_by_user_and_string(self, _user: str, _string: str):
-        """This method uses twint to extract tweets brased on string and username"""
         self.conf.Username = _user
         self.conf.Search = _string
         return self.__get_tweets__from_twint__()
-    # TODO: make method static
     def get_only_tweets(self, tweet_and_replies_info):
         tweet_and_replies = tweet_and_replies_info["tweet"]
         """
@@ -89,12 +91,16 @@ class TwitterScraper(object):
         self.conf.Limit = self.num_tweets  # specifies how many tweets should be scraped
         self.conf.Since = self.from_date
         self.conf.Until = self.to_date
-        self.conf.Hide_output = True  # Hides the output. If set to False it will prints tweets in the terminal window.
         twint.run.Search(self.conf)
         tweet_and_replies_inf = twint.output.panda.Tweets_df  # here we say that the output should be a dataframe.
-        tweet_and_replies_inf = tweet_and_replies_inf[
-            ["id", "tweet", "date", "user_id", "username", "urls", 'nlikes', 'nreplies', 'nretweets']]
         return tweet_and_replies_inf
     # def __check_date_type(d1,d2): if (type(d1) or type(d2)) is not type("str"):  # If the type of ite date input
     # is not string it generates exception print("[!] Please make sure the date is a string in this format
     # \"yyyy-mm-dd\" ") raise EXCEPTION("Incorrect date type Exception!") elif (len(d1.split("-")) or len(d2.split(
@@ -105,9 +111,18 @@ class TwitterScraper(object):
         return "TwitterScraper(from_date={}, to_date={}, num_tweets={})".format(self.from_date, self.to_date,
                                                                                 self.num_tweets)
 if __name__ == "__main__":
      sc = TwitterScraper(from_date="2022-05-01", to_date="2022-07-31", num_tweets=40)
-     dc = sc.scrape_by_user("jimmieakesson")
-     print(dc.head())
-     print(dc.shape)

+import pandas as pd
 import twint
 from datetime import date
         self.num_tweets = num_tweets
         self.conf = twint.Config()
+    def scrape_by_user(self, _user: str):
         """This method uses twint to extract tweets  based on username"""
         self.conf.Search = "from:@" + _user  # if the search configuration is given in this format, it searches for
         # user names.
         self.conf.Search = _string  # this tells twint configuration to search for string
         return self.__get_tweets__from_twint__()
+    # TODO: Possibly include more than one user
     def scrape_by_user_and_string(self, _user: str, _string: str):
+        """This method uses twint to extract tweets based on string and username"""
         self.conf.Username = _user
         self.conf.Search = _string
         return self.__get_tweets__from_twint__()
+    # TODO: make method static (Possibly remove this)
     def get_only_tweets(self, tweet_and_replies_info):
         tweet_and_replies = tweet_and_replies_info["tweet"]
         """
         self.conf.Limit = self.num_tweets  # specifies how many tweets should be scraped
         self.conf.Since = self.from_date
         self.conf.Until = self.to_date
+        self.conf.Hide_output = True  # Hides the output. If set to False it will print tweets in the terminal window.
         twint.run.Search(self.conf)
         tweet_and_replies_inf = twint.output.panda.Tweets_df  # here we say that the output should be a dataframe.
+        if tweet_and_replies_inf.empty:
+            print("No tweet containing the word \"" + self.conf.Search + "\" could be found!")
+        else:
+            tweet_and_replies_inf = tweet_and_replies_inf[
+                     ["id", "tweet", "date", "user_id", "username", "urls", 'nlikes', 'nreplies', 'nretweets']]
         return tweet_and_replies_inf
     # def __check_date_type(d1,d2): if (type(d1) or type(d2)) is not type("str"):  # If the type of ite date input
     # is not string it generates exception print("[!] Please make sure the date is a string in this format
     # \"yyyy-mm-dd\" ") raise EXCEPTION("Incorrect date type Exception!") elif (len(d1.split("-")) or len(d2.split(
         return "TwitterScraper(from_date={}, to_date={}, num_tweets={})".format(self.from_date, self.to_date,
                                                                                 self.num_tweets)
 if __name__ == "__main__":
+     user_list = ['jimmieakesson', 'BuschEbba', 'annieloof', 'JohanPehrson', 'bolund', 'martastenevi', 'SwedishPM', 'dadgostarnooshi']
      sc = TwitterScraper(from_date="2022-05-01", to_date="2022-07-31", num_tweets=40)
+     dc1 = sc.scrape_by_user("jimmieakesson")
+     dc2 = sc.scrape_by_user_and_string("jimmieakesson", "pension")
+     dc3 = sc.scrape_by_several_users(user_list)
+     print(dc2)
+     print(dc1.head())
+     print(dc3.head)