Spaces:

politweet-sh
/

politweet

Runtime error

App Files Files Community

Oresti Theodoridis committed on Jul 18, 2022

Commit

dcbfe5a

•

2 Parent(s): e65e077 c6bed98

Merge pull request #64 from Demea9000/63-fix-todos-in-twitterscraper

Browse files

Files changed (2) hide show

textclassifier/TextClassifier.py +1 -7
twitterscraper/TwitterScraper.py +26 -10

textclassifier/TextClassifier.py CHANGED Viewed

@@ -29,21 +29,15 @@ class TextClassifier:
         :param to_date: string of the format 'YYYY-MM-DD'.
         :param num_tweets: integer value of the maximum number of tweets to be scraped.
         """
-        # Make sure to_date is later than from_date
-        assert from_date < to_date, "from_date must be earlier than to_date"
-        # Make sure the dates are in the correct format
-        assert re.match(r'^\d{4}-\d{2}-\d{2}$', from_date) is not None, "from_date must be in the format YYYY-MM-DD"
         # Make sure user_name is not empty
         assert user_name is not None, "user_name cannot be empty"
-        # Make sure num_tweets is a positive integer
-        assert 0 < num_tweets <= 20, "num_tweets must be a positive integer and at most 20"
         self.model_name = model_name
         self.from_date = from_date
         self.to_date = to_date
         self.num_tweets = num_tweets
         self.user_name = user_name
-        self.ts = TwitterScraper.TwitterScraper(from_date, to_date, num_tweets)
         # Assure that scrape_by_user actually gets num_tweets
         # add timer in time-loop and stop after 10 seconds
         start_time = time.time()

         :param to_date: string of the format 'YYYY-MM-DD'.
         :param num_tweets: integer value of the maximum number of tweets to be scraped.
         """
         # Make sure user_name is not empty
         assert user_name is not None, "user_name cannot be empty"
+        self.ts = TwitterScraper.TwitterScraper(from_date, to_date, num_tweets)
         self.model_name = model_name
         self.from_date = from_date
         self.to_date = to_date
         self.num_tweets = num_tweets
         self.user_name = user_name
         # Assure that scrape_by_user actually gets num_tweets
         # add timer in time-loop and stop after 10 seconds
         start_time = time.time()

twitterscraper/TwitterScraper.py CHANGED Viewed

@@ -1,6 +1,7 @@
 import pandas as pd
 import twint
 from datetime import date
 class TwitterScraper(object):
@@ -13,10 +14,21 @@ class TwitterScraper(object):
     """
     def __init__(self, from_date="2022-07-01", to_date=str(date.today()), num_tweets=20):
-        # TODO: add a check to make sure that the dates are in the correct format.
-        #  TODO: add a check to make sure that the number of tweets is a positive number.
-        #  TODO: add a check to make sure that the number of tweets
-        #   is not greater than the number of tweets in the date range.
         self.from_date = from_date
         self.to_date = to_date
         self.num_tweets = num_tweets
@@ -48,9 +60,13 @@ class TwitterScraper(object):
         self.conf.Search = _string  # this tells twint configuration to search for string
         return self.__get_tweets__from_twint__()
-    # TODO: Possibly include more than one user
     def scrape_by_user_and_string(self, _user: str, _string: str):
-        """This method uses twint to extract tweets based on string and username"""
         self.conf.Username = _user
         self.conf.Search = _string
         return self.__get_tweets__from_twint__()
@@ -74,7 +90,8 @@ class TwitterScraper(object):
         return tweets_info
     def __get_tweets__from_twint__(self):
-        """ __get_tweets_from_twint__
         tweet info is a dataframe with the following columns
             Index(['id', 'conversation_id', 'created_at', 'date', 'timezone', 'place',
             'tweet', 'language', 'hashtags', 'cashtags', 'user_id', 'user_id_str',
@@ -94,12 +111,12 @@ class TwitterScraper(object):
         self.conf.Until = self.to_date
         self.conf.Hide_output = True  # Hides the output. If set to False it will print tweets in the terminal window.
         twint.run.Search(self.conf)
-        tweet_and_replies_inf = twint.output.panda.Tweets_df  # here we say that output souldwe dataframe.
         if tweet_and_replies_inf.empty:
             print("No tweet containing the word \"" + self.conf.Search + "\" could be found!")
         else:
             tweet_and_replies_inf = tweet_and_replies_inf[
-                     ["id", "tweet", "date", "user_id", "username", "urls", 'nlikes', 'nreplies', 'nretweets']]
         return tweet_and_replies_inf
     # def __check_date_type(d1,d2): if (type(d1) or type(d2)) is not type("str"):  # If the type of ite date input
@@ -111,7 +128,6 @@ class TwitterScraper(object):
     def __repr__(self):
         return "TwitterScraper(from_date={}, to_date={}, num_tweets={})".format(self.from_date, self.to_date,
                                                                                 self.num_tweets)
 if __name__ == "__main__":

 import pandas as pd
 import twint
 from datetime import date
+import re
 class TwitterScraper(object):
     """
     def __init__(self, from_date="2022-07-01", to_date=str(date.today()), num_tweets=20):
+        """
+        This method initializes the TwitterScraper class. It stores the date range and the maximum number of tweets
+        to scrape: tweets are collected from 'from_date' to 'to_date'. If not specified, 'from_date' defaults to
+        "2022-07-01", 'to_date' defaults to the current date, and 'num_tweets' defaults to 20.
+        :param from_date: str (format: YYYY-MM-DD)
+        :param to_date: str (format: YYYY-MM-DD)
+        :param num_tweets: int (number of tweets to be scraped)
+        """
+        # Make sure the dates are in the correct format
+        assert re.match(r'^\d{4}-\d{2}-\d{2}$', from_date) is not None, "from_date must be in the format YYYY-MM-DD"
+        # Make sure to_date is later than from_date
+        assert from_date < to_date, "from_date must be earlier than to_date"
+        # Make sure num_tweets is a positive integer
+        assert 0 < num_tweets <= 20, "num_tweets must be a positive integer and at most 20"
         self.from_date = from_date
         self.to_date = to_date
         self.num_tweets = num_tweets
         self.conf.Search = _string  # this tells twint configuration to search for string
         return self.__get_tweets__from_twint__()
     def scrape_by_user_and_string(self, _user: str, _string: str):
+        """
+        This method uses twint to extract tweets based on string and username. It takes a single username as input.
+        :param _user: str
+        :param _string: str
+        :return: dataframe
+        """
         self.conf.Username = _user
         self.conf.Search = _string
         return self.__get_tweets__from_twint__()
         return tweets_info
     def __get_tweets__from_twint__(self):
+        """
+        __get_tweets_from_twint__
         tweet info is a dataframe with the following columns
             Index(['id', 'conversation_id', 'created_at', 'date', 'timezone', 'place',
             'tweet', 'language', 'hashtags', 'cashtags', 'user_id', 'user_id_str',
         self.conf.Until = self.to_date
         self.conf.Hide_output = True  # Hides the output. If set to False it will print tweets in the terminal window.
         twint.run.Search(self.conf)
+        tweet_and_replies_inf = twint.output.panda.Tweets_df  # here we say that output is a dataframe
         if tweet_and_replies_inf.empty:
             print("No tweet containing the word \"" + self.conf.Search + "\" could be found!")
         else:
             tweet_and_replies_inf = tweet_and_replies_inf[
+                ["id", "tweet", "date", "user_id", "username", "urls", 'nlikes', 'nreplies', 'nretweets']]
         return tweet_and_replies_inf
     # def __check_date_type(d1,d2): if (type(d1) or type(d2)) is not type("str"):  # If the type of ite date input
     def __repr__(self):
         return "TwitterScraper(from_date={}, to_date={}, num_tweets={})".format(self.from_date, self.to_date,
                                                                                 self.num_tweets)
 if __name__ == "__main__":