Spaces:
Runtime error
Runtime error
File size: 6,578 Bytes
ae34e1d 6718c74 b24e23b 7edfe41 fa0430c 7edfe41 ae34e1d 7edfe41 b24e23b 58a2a29 7edfe41 b24e23b da9055d 7edfe41 c6bed98 7edfe41 58a2a29 7edfe41 fa0430c 7edfe41 c6bed98 7edfe41 58a2a29 7edfe41 c6bed98 58a2a29 ae34e1d 7edfe41 58a2a29 7edfe41 03f299a b24e23b 5cf061d 7edfe41 deaf095 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 |
import re
from datetime import date
import twint
class TwitterScraper(object):
    """
    Collect tweets with twint inside a fixed date window.

    An instance stores a date range (``from_date`` .. ``to_date``) and a cap on
    the number of tweets (``num_tweets``). The ``scrape_by_*`` methods set the
    twint search configuration and return the result of the search.

    input: from_date, to_date, num_tweets
    output: the pandas dataframe twint exposes as ``twint.output.panda.Tweets_df``,
            reduced to the columns listed in ``_COLUMNS`` when it is not empty.
    """

    # Largest number of tweets a single search may request.
    MAX_TWEETS = 20
    # Dates are handed to twint as plain strings, so only the shape is checked.
    _DATE_PATTERN = re.compile(r'\d{4}-\d{2}-\d{2}')
    # Columns kept from the dataframe twint produces.
    _COLUMNS = ["id", "tweet", "date", "user_id", "username", "urls", 'nlikes', 'nreplies', 'nretweets']

    def __init__(self, from_date="2022-07-01", to_date=None, num_tweets=20):
        """
        Validate the search window and create the twint configuration.

        :param from_date: str (format: YYYY-MM-DD), start of the window
        :param to_date: str (format: YYYY-MM-DD), end of the window. ``None``
            (the default) means today's date, resolved when the instance is
            created rather than once at import time.
        :param num_tweets: int (number of tweets to be scraped, 1..20)
        :raises ValueError: if a date is not a YYYY-MM-DD string, if from_date
            is not earlier than to_date, or if num_tweets is out of range.
        """
        if to_date is None:
            to_date = str(date.today())

        # Explicit raises instead of assert: asserts are removed under `python -O`.
        for label, value in (("from_date", from_date), ("to_date", to_date)):
            if not isinstance(value, str) or self._DATE_PATTERN.fullmatch(value) is None:
                raise ValueError(label + " must be in the format YYYY-MM-DD")

        # Both strings are zero-padded YYYY-MM-DD here, so comparing them as
        # strings orders them chronologically.
        if not from_date < to_date:
            raise ValueError("from_date must be earlier than to_date")

        if not 0 < num_tweets <= self.MAX_TWEETS:
            raise ValueError("num_tweets must be a positive integer and at most 20")

        self.from_date = from_date
        self.to_date = to_date
        self.num_tweets = num_tweets
        self.conf = twint.Config()

    def scrape_by_user(self, _user: str):
        """
        Search for tweets written by one user.

        :param _user: user name without the leading "@"
        :return: dataframe
        """
        # Clear a user filter left behind by scrape_by_user_and_string.
        # NOTE(review): assumes None is twint's "no user filter" value — confirm.
        self.conf.Username = None
        # The "from:@<name>" search form restricts the search to that user name.
        self.conf.Search = "from:@" + _user
        return self.__get_tweets__from_twint__()

    def scrape_by_several_users(self, _users: list):
        """
        Search for tweets written by any of several users.

        :param _users: list of user names without the leading "@"
        :return: dataframe
        :raises ValueError: if _users is empty
        """
        # TODO: test this method
        if not _users:
            raise ValueError("_users must contain at least one user name")
        self.conf.Username = None  # see scrape_by_user
        # Builds "from:@a OR from:@b OR ..."
        self.conf.Search = " OR ".join("from:@" + user for user in _users)
        return self.__get_tweets__from_twint__()

    def scrape_by_string(self, _string: str):
        """
        Search for tweets matching a search string.

        :param _string: text handed to twint as the search query
        :return: dataframe
        """
        self.conf.Username = None  # see scrape_by_user
        self.conf.Search = _string
        return self.__get_tweets__from_twint__()

    def scrape_by_user_and_string(self, _user: str, _string: str):
        """
        Search for tweets by one user that match a search string.

        :param _user: str
        :param _string: str
        :return: dataframe
        """
        self.conf.Username = _user
        self.conf.Search = _string
        return self.__get_tweets__from_twint__()

    # TODO: make method static (Possibly remove this)
    def get_only_tweets(self, tweet_and_replies_info):
        """
        Remove the rows whose tweet text starts with "@".

        The original author treats a leading "@" as the marker of a reply or
        retweet. Rows are selected with a boolean mask, so the result is
        correct for any dataframe index, not only the default 0..n-1 one.

        :param tweet_and_replies_info: dataframe with a "tweet" column
            (the output of the scrape methods)
        :return: a new dataframe without the rows starting with "@"
        """
        texts = tweet_and_replies_info["tweet"]
        # na=False: a missing tweet text is not a reply, so the row is kept.
        is_reply = texts.str.startswith("@", na=False).astype(bool)
        return tweet_and_replies_info.loc[~is_reply]

    def __get_tweets__from_twint__(self):
        """
        Run the configured twint search and return its dataframe.

        According to the original author, twint's dataframe has these columns:
        Index(['id', 'conversation_id', 'created_at', 'date', 'timezone', 'place',
        'tweet', 'language', 'hashtags', 'cashtags', 'user_id', 'user_id_str',
        'username', 'name', 'day', 'hour', 'link', 'urls', 'photos', 'video',
        'thumbnail', 'retweet', 'nlikes', 'nreplies', 'nretweets', 'quote_url',
        'search', 'near', 'geo', 'source', 'user_rt_id', 'user_rt',
        'retweet_id', 'reply_to', 'retweet_date', 'translate', 'trans_src',
        'trans_dest']
        Only the columns in ``_COLUMNS`` are kept.

        :return: the reduced dataframe, or the empty dataframe as twint
            produced it when nothing was found (a message is printed then).
        """
        self.conf.Pandas = True  # the result is read from twint.output.panda below
        self.conf.Count = True  # NOTE(review): presumably makes twint count fetched tweets — confirm
        self.conf.Limit = self.num_tweets  # specifies how many tweets should be scraped
        self.conf.Since = self.from_date
        self.conf.Until = self.to_date
        self.conf.Hide_output = True  # if set to False twint prints tweets in the terminal window
        twint.run.Search(self.conf)

        tweets_df = twint.output.panda.Tweets_df
        if tweets_df.empty:
            print("No tweet containing the word \"" + self.conf.Search + "\" could be found!")
            return tweets_df
        return tweets_df[self._COLUMNS]

    def __repr__(self):
        """Return a constructor-like description of the search window and tweet cap."""
        return "TwitterScraper(from_date={}, to_date={}, num_tweets={})".format(self.from_date, self.to_date,
                                                                                self.num_tweets)
if __name__ == "__main__":
    # Manual smoke test: fetch tweets from one account and print a summary.
    # num_tweets must stay within the constructor's limit of 20; the previous
    # value of 40 was rejected before any scraping could start.
    sc = TwitterScraper(from_date="2022-05-01", to_date="2022-07-31", num_tweets=20)
    dc = sc.scrape_by_user("jimmieakesson")
    print(dc.head())
    print(dc.shape)
    print(dc.columns)
|