Spaces:
Runtime error
Runtime error
import pandas as pd | |
import twint | |
from datetime import date | |
import re | |
class TwitterScraper(object):
    """
    Scrape tweets with twint for a fixed date window and tweet limit.

    An instance stores the window ('from_date' .. 'to_date') and the maximum
    number of tweets per search ('num_tweets'). Every ``scrape_*`` method
    configures a twint search and returns the result as a pandas dataframe.
    input: from_date, to_date, num_tweets
    output: dataframe (returned by the scrape methods)
    """

    # Dates are compared as strings, which is only meaningful for zero-padded
    # YYYY-MM-DD values, so both ends of the window must match this pattern.
    _DATE_PATTERN = re.compile(r"\d{4}-\d{2}-\d{2}")

    # Subset of the twint dataframe columns that the scrape methods return.
    _RELEVANT_COLUMNS = ["id", "tweet", "date", "user_id", "username", "urls",
                         "nlikes", "nreplies", "nretweets"]

    def __init__(self, from_date="2022-07-01", to_date=None, num_tweets=20):
        """
        Validate and store the search window and create the twint configuration.

        :param from_date: str (format: YYYY-MM-DD), start of the window
        :param to_date: str (format: YYYY-MM-DD), end of the window; None (the
            default) means today's date, resolved when the instance is created
        :param num_tweets: int (number of tweets to be scraped), 1 to 20
        :raises AssertionError: if a date is malformed, from_date is not earlier
            than to_date, or num_tweets is outside 1..20
        """
        # Resolve "today" per call; a `str(date.today())` default in the
        # signature would be frozen at class-definition time.
        if to_date is None:
            to_date = str(date.today())

        # The checks raise explicitly (instead of using `assert`) so they are
        # still enforced when Python runs with -O; the exception type stays
        # AssertionError for existing callers.
        if not isinstance(from_date, str) or self._DATE_PATTERN.fullmatch(from_date) is None:
            raise AssertionError("from_date must be in the format YYYY-MM-DD")
        if not isinstance(to_date, str) or self._DATE_PATTERN.fullmatch(to_date) is None:
            raise AssertionError("to_date must be in the format YYYY-MM-DD")
        # Make sure to_date is later than from_date
        if not from_date < to_date:
            raise AssertionError("from_date must be earlier than to_date")
        # Make sure num_tweets is a positive integer
        if not 0 < num_tweets <= 20:
            raise AssertionError("num_tweets must be a positive integer and at most 20")

        self.from_date = from_date
        self.to_date = to_date
        self.num_tweets = num_tweets
        self.conf = twint.Config()

    def _run_search(self, query, username=None):
        """
        Set the search query and username filter on the shared config and run it.

        The username is always written (None unless given) so that a filter set
        by an earlier scrape_by_user_and_string call does not leak into later
        searches on the same instance.
        """
        # NOTE(review): None is twint's default for Config.Username - confirm
        # against the installed twint version.
        self.conf.Username = username
        self.conf.Search = query
        return self.__get_tweets__from_twint__()

    def scrape_by_user(self, _user: str):
        """This method uses twint to extract tweets based on username"""
        # A search string in the form "from:@<name>" searches by user name.
        return self._run_search("from:@" + _user)

    def scrape_by_several_users(self, _users: list):
        """
        This method uses twint to extract tweets based on username. It takes a list of users as input.
        :param _users: list of users (must not be empty)
        :return: dataframe
        :raises ValueError: if _users is empty
        """
        # TODO: test this method
        if not _users:
            raise ValueError("_users must contain at least one username")
        query = " OR ".join("from:@" + user for user in _users)
        return self._run_search(query)

    def scrape_by_string(self, _string: str):
        """This method uses twint to extract tweets based on string.
        all extracted tweets have the specified word in _string parameter in it.
        """
        return self._run_search(_string)

    def scrape_by_user_and_string(self, _user: str, _string: str):
        """
        This method uses twint to extract tweets based on string and username.
        :param _user: str
        :param _string: str
        :return: dataframe
        """
        return self._run_search(_string, username=_user)

    # TODO: make method static (Possibly remove this)
    def get_only_tweets(self, tweet_and_replies_info):
        """
        Return a new dataframe without the rows whose tweet text starts with "@".

        The input is a dataframe as returned by the scrape methods; a leading
        "@" is treated as the indicator of a reply or retweet. The input frame
        is not modified.
        :param tweet_and_replies_info: dataframe with a "tweet" column
        :return: dataframe containing only the remaining rows
        """
        if tweet_and_replies_info.empty:
            # Nothing to filter; an empty scrape result may not even have a
            # "tweet" column.
            return tweet_and_replies_info.copy()

        # Build a positional boolean mask rather than collecting row numbers:
        # it works for any index labels, not only the default 0..n-1 range.
        keep = [
            not (isinstance(text, str) and text.startswith("@"))
            for text in tweet_and_replies_info["tweet"]
        ]
        return tweet_and_replies_info.loc[keep]

    def __get_tweets__from_twint__(self):
        """
        Run the configured twint search and return the tweets as a dataframe.

        The twint output dataframe has the following columns
        Index(['id', 'conversation_id', 'created_at', 'date', 'timezone', 'place',
        'tweet', 'language', 'hashtags', 'cashtags', 'user_id', 'user_id_str',
        'username', 'name', 'day', 'hour', 'link', 'urls', 'photos', 'video',
        'thumbnail', 'retweet', 'nlikes', 'nreplies', 'nretweets', 'quote_url',
        'search', 'near', 'geo', 'source', 'user_rt_id', 'user_rt',
        'retweet_id', 'reply_to', 'retweet_date', 'translate', 'trans_src',
        'trans_dest']
        we just pick the relevant ones (see _RELEVANT_COLUMNS).
        If nothing was found, a message is printed and the empty dataframe is
        returned unchanged.
        """
        self.conf.Pandas = True
        self.conf.Count = True
        self.conf.Limit = self.num_tweets  # specifies how many tweet should be scraped
        self.conf.Since = self.from_date
        self.conf.Until = self.to_date
        self.conf.Hide_output = True  # If set to False twint prints the tweets in the terminal window.
        twint.run.Search(self.conf)

        # twint exposes the result of the last search as a module-level dataframe.
        tweet_and_replies_inf = twint.output.panda.Tweets_df
        if tweet_and_replies_inf.empty:
            print("No tweet containing the word \"" + self.conf.Search + "\" could be found!")
            return tweet_and_replies_inf
        return tweet_and_replies_inf[self._RELEVANT_COLUMNS]

    def __repr__(self):
        return "TwitterScraper(from_date={}, to_date={}, num_tweets={})".format(
            self.from_date, self.to_date, self.num_tweets)
if __name__ == "__main__":
    # Demo run: scrape one account inside a fixed window and show the result.
    # num_tweets must stay within the constructor's 1..20 limit; the previous
    # value of 40 made the script abort with an AssertionError before scraping.
    sc = TwitterScraper(from_date="2022-05-01", to_date="2022-07-31", num_tweets=20)
    dc = sc.scrape_by_user("jimmieakesson")
    print(dc.head())
    print(dc.shape)
    print(dc.columns)