Spaces:
Runtime error
Runtime error
File size: 6,164 Bytes
58a2a29 b24e23b 7edfe41 b24e23b 7edfe41 da9055d 7edfe41 b24e23b 58a2a29 7edfe41 b24e23b da9055d 7edfe41 58a2a29 7edfe41 58a2a29 7edfe41 58a2a29 7edfe41 58a2a29 7edfe41 58a2a29 7edfe41 58a2a29 7edfe41 03f299a 7edfe41 58a2a29 7edfe41 58a2a29 b24e23b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 |
import pandas as pd
import twint
from datetime import date
class TwitterScraper(object):
    """Collect tweets with twint and return them as pandas DataFrames.

    An instance stores a date window (``from_date`` .. ``to_date``) and an
    upper bound on the number of tweets (``num_tweets``).  The ``scrape_*``
    methods configure a search (by user, by several users, by free text, or
    by user and text), run it, and return a DataFrame.  Non-empty results are
    restricted to the columns listed in ``_RELEVANT_COLUMNS``.
    """

    # Columns kept from the full tweet DataFrame produced by twint.
    _RELEVANT_COLUMNS = [
        "id", "tweet", "date", "user_id", "username", "urls",
        "nlikes", "nreplies", "nretweets",
    ]

    def __init__(self, from_date="2022-07-01", to_date=None, num_tweets=20):
        """Store the search window and create the twint configuration.

        :param from_date: start of the window, a "yyyy-mm-dd" string.
        :param to_date: end of the window, a "yyyy-mm-dd" string.  ``None``
            (the default) means today's date, resolved when the instance is
            created rather than when the module is imported.
        :param num_tweets: maximum number of tweets to scrape.
        """
        # TODO: add a check to make sure that the dates are strings in the
        #  "yyyy-mm-dd" format.
        # TODO: add a check to make sure that the number of tweets is a positive number.
        # TODO: add a check to make sure that the number of tweets
        #  is not greater than the number of tweets in the date range.
        self.from_date = from_date
        # A default of str(date.today()) in the signature would be evaluated
        # only once, at definition time, and go stale in a long-lived process.
        self.to_date = str(date.today()) if to_date is None else to_date
        self.num_tweets = num_tweets
        self.conf = twint.Config()

    def scrape_by_user(self, _user: str):
        """Return the tweets written by ``_user`` as a DataFrame."""
        # Clear any username left behind by scrape_by_user_and_string so it
        # does not silently narrow this search.
        self.conf.Username = None
        # A search string in the form "from:@name" selects tweets by user name.
        self.conf.Search = "from:@" + _user
        return self.__get_tweets__from_twint__()

    def scrape_by_several_users(self, _users: list):
        """Return the tweets written by any of the given users.

        :param _users: non-empty list of user names
        :return: dataframe
        :raises ValueError: if ``_users`` is empty
        """
        # TODO: test this method
        if not _users:
            raise ValueError("_users must contain at least one user name")
        self.conf.Username = None  # see scrape_by_user
        self.conf.Search = " OR ".join("from:@" + user for user in _users)
        return self.__get_tweets__from_twint__()

    def scrape_by_string(self, _string: str):
        """Return the tweets matching the search text ``_string``."""
        self.conf.Username = None  # see scrape_by_user
        self.conf.Search = _string
        return self.__get_tweets__from_twint__()

    # TODO: Possibly include more than one user
    def scrape_by_user_and_string(self, _user: str, _string: str):
        """Return the tweets by ``_user`` that match the text ``_string``."""
        self.conf.Username = _user
        self.conf.Search = _string
        return self.__get_tweets__from_twint__()

    # TODO: make method static (Possibly remove this)
    def get_only_tweets(self, tweet_and_replies_info: pd.DataFrame) -> pd.DataFrame:
        """Drop the rows whose tweet text starts with "@".

        A leading "@" is treated as the indicator of a reply or retweet.

        :param tweet_and_replies_info: DataFrame with a "tweet" column (the
            output of the scrape methods).
        :return: a new DataFrame without those rows; the input is not modified.
        """
        # Build a positional mask instead of looking rows up by label, so the
        # filter also works when the frame does not carry a 0..n-1 index.
        # Entries that are not strings are kept.
        keep = [
            not (isinstance(text, str) and text.startswith("@"))
            for text in tweet_and_replies_info["tweet"]
        ]
        return tweet_and_replies_info.loc[keep]

    def __get_tweets__from_twint__(self):
        """Run the configured twint search and return the result DataFrame.

        twint's tweet DataFrame has the following columns:
        Index(['id', 'conversation_id', 'created_at', 'date', 'timezone', 'place',
        'tweet', 'language', 'hashtags', 'cashtags', 'user_id', 'user_id_str',
        'username', 'name', 'day', 'hour', 'link', 'urls', 'photos', 'video',
        'thumbnail', 'retweet', 'nlikes', 'nreplies', 'nretweets', 'quote_url',
        'search', 'near', 'geo', 'source', 'user_rt_id', 'user_rt',
        'retweet_id', 'reply_to', 'retweet_date', 'translate', 'trans_src',
        'trans_dest']
        Only the relevant ones are kept.  If nothing was found a message is
        printed and the empty frame is returned as is.
        """
        self.conf.Pandas = True  # collect the result in a pandas DataFrame
        self.conf.Count = True
        self.conf.Limit = self.num_tweets  # how many tweets should be scraped
        self.conf.Since = self.from_date
        self.conf.Until = self.to_date
        self.conf.Hide_output = True  # if False, tweets are printed in the terminal window
        twint.run.Search(self.conf)
        # NOTE(review): the result lives in a module-level attribute of twint;
        # confirm it is reset when a search finds nothing, otherwise a previous
        # result could be returned here.
        tweet_and_replies_inf = twint.output.panda.Tweets_df
        if tweet_and_replies_inf.empty:
            print("No tweet containing the word \"" + self.conf.Search + "\" could be found!")
        else:
            tweet_and_replies_inf = tweet_and_replies_inf[self._RELEVANT_COLUMNS]
        return tweet_and_replies_inf

    def __repr__(self):
        """Return a constructor-like summary of the stored settings."""
        return "TwitterScraper(from_date={}, to_date={}, num_tweets={})".format(
            self.from_date, self.to_date, self.num_tweets)
if __name__ == "__main__":
    # Demo run: scrape one user, one user filtered by a word, and a list of
    # users, then print the results.
    user_list = ['jimmieakesson', 'BuschEbba', 'annieloof', 'JohanPehrson', 'bolund', 'martastenevi', 'SwedishPM', 'dadgostarnooshi']
    sc = TwitterScraper(from_date="2022-05-01", to_date="2022-07-31", num_tweets=40)
    dc1 = sc.scrape_by_user("jimmieakesson")
    dc2 = sc.scrape_by_user_and_string("jimmieakesson", "pension")
    dc3 = sc.scrape_by_several_users(user_list)
    print(dc2)
    print(dc1.head())
    # head must be called; printing the bare attribute shows the bound method
    # instead of the first rows.
    print(dc3.head())
|