File size: 5,990 Bytes
58a2a29
b24e23b
7edfe41
b24e23b
 
7edfe41
 
 
 
 
 
 
 
fa0430c
7edfe41
da9055d
 
 
 
7edfe41
 
 
 
b24e23b
58a2a29
7edfe41
 
 
 
b24e23b
da9055d
 
 
 
 
 
 
 
 
 
 
 
 
7edfe41
 
 
 
 
 
 
58a2a29
7edfe41
58a2a29
7edfe41
 
 
 
58a2a29
7edfe41
 
 
 
 
 
 
 
 
 
 
 
 
 
fa0430c
7edfe41
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58a2a29
7edfe41
 
58a2a29
 
 
 
 
7edfe41
58a2a29
7edfe41
 
 
 
 
03f299a
 
 
 
e92325c
b24e23b
5cf061d
7edfe41
deaf095
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
import pandas as pd
import twint
from datetime import date


class TwitterScraper(object):
    """
    Thin wrapper around twint that collects tweets into a pandas DataFrame.

    A scraper is configured once with a date window and a tweet limit. The
    ``scrape_*`` methods then run a twint search (by user, by several users,
    by text, or by user and text) and return the result as a DataFrame
    reduced to the columns listed in ``_COLUMNS``. When the search yields an
    empty frame, that frame is returned as-is and a notice is printed.

    :param from_date: start of the window, handed to twint as ``Since``
    :param to_date: end of the window, handed to twint as ``Until``; when
        omitted (``None``), today's date at construction time is used
    :param num_tweets: handed to twint as ``Limit``
    """

    # Columns kept from twint's tweet frame; everything else is dropped.
    _COLUMNS = ["id", "tweet", "date", "user_id", "username", "urls",
                "nlikes", "nreplies", "nretweets"]

    def __init__(self, from_date="2022-07-01", to_date=None, num_tweets=20):
        # TODO: add a check to make sure that the dates are in the correct format
        #  (a string such as "yyyy-mm-dd").
        # TODO: add a check to make sure that the number of tweets is a positive number.
        # TODO: add a check to make sure that the number of tweets
        #  is not greater than the number of tweets in the date range.
        self.from_date = from_date
        # Resolve "today" per instance: a default of str(date.today()) in the
        # signature is evaluated only once, when the module is imported.
        self.to_date = str(date.today()) if to_date is None else to_date
        self.num_tweets = num_tweets
        self.conf = twint.Config()

    def scrape_by_user(self, _user: str):
        """Return the tweets written by ``_user``.

        :param _user: user name without the leading ``@``
        :return: dataframe
        """
        # A search string of the form "from:@name" makes twint search by author.
        self._set_query(self._users_query([_user]))
        return self.__get_tweets__from_twint__()

    def scrape_by_several_users(self, _users: list):
        """Return the tweets written by any of the users in ``_users``.

        :param _users: list of user names without the leading ``@``
        :return: dataframe
        :raises ValueError: if ``_users`` is empty
        """
        # TODO: test this method
        self._set_query(self._users_query(_users))
        return self.__get_tweets__from_twint__()

    def scrape_by_string(self, _string: str):
        """Return the tweets matching the search text ``_string``.

        :param _string: search text handed to twint unchanged
        :return: dataframe
        """
        self._set_query(_string)
        return self.__get_tweets__from_twint__()

    # TODO: Possibly include more than one user
    def scrape_by_user_and_string(self, _user: str, _string: str):
        """Return the tweets of ``_user`` matching the search text ``_string``.

        :param _user: user name, handed to twint as ``Username``
        :param _string: search text handed to twint unchanged
        :return: dataframe
        """
        self._set_query(_string, username=_user)
        return self.__get_tweets__from_twint__()

    # TODO: make method static (Possibly remove this)
    def get_only_tweets(self, tweet_and_replies_info):
        """Return a copy of the frame without the rows whose tweet starts with "@".

        A leading "@" is treated as the marker of a reply or retweet. Rows are
        selected with a boolean mask, so the frame may carry any index (for
        example one left over from earlier filtering).

        :param tweet_and_replies_info: dataframe with a ``tweet`` column, such
            as the output of the scrape methods
        :return: new dataframe containing only the remaining rows
        """
        if tweet_and_replies_info.empty:
            # Nothing to filter; an empty frame may not even have a "tweet" column.
            return tweet_and_replies_info.copy()
        # na=False: rows without tweet text are not replies, so they are kept.
        is_reply = tweet_and_replies_info["tweet"].str.startswith("@", na=False)
        return tweet_and_replies_info[~is_reply]

    @staticmethod
    def _users_query(_users):
        """Build the twint search string matching tweets from any of ``_users``."""
        users = list(_users)
        if not users:
            raise ValueError("at least one user name is required")
        return " OR ".join("from:@" + user for user in users)

    def _set_query(self, search, username=None):
        """Store the search text and user filter on the shared twint config.

        Both fields are written on every call so that a user filter set by an
        earlier ``scrape_by_user_and_string`` call cannot leak into later
        searches that reuse the same config object.
        """
        self.conf.Search = search
        self.conf.Username = username

    def __get_tweets__from_twint__(self):
        """Run the configured twint search and return the relevant columns.

        The tweet frame produced by twint has the following columns:
            Index(['id', 'conversation_id', 'created_at', 'date', 'timezone', 'place',
            'tweet', 'language', 'hashtags', 'cashtags', 'user_id', 'user_id_str',
            'username', 'name', 'day', 'hour', 'link', 'urls', 'photos', 'video',
            'thumbnail', 'retweet', 'nlikes', 'nreplies', 'nretweets', 'quote_url',
            'search', 'near', 'geo', 'source', 'user_rt_id', 'user_rt',
            'retweet_id', 'reply_to', 'retweet_date', 'translate', 'trans_src',
            'trans_dest']
        Only the ones listed in ``_COLUMNS`` are kept.

        :return: dataframe with the ``_COLUMNS`` columns, or the empty frame
            unchanged when the search produced no rows
        """
        self.conf.Pandas = True  # collect the results in twint's pandas storage
        self.conf.Count = True  # NOTE(review): presumably enables twint's tweet counter - confirm
        self.conf.Limit = self.num_tweets  # specifies how many tweets should be scraped
        self.conf.Since = self.from_date
        self.conf.Until = self.to_date
        self.conf.Hide_output = True  # If set to False, twint prints the tweets in the terminal window.
        twint.run.Search(self.conf)
        found = twint.output.panda.Tweets_df  # module-level frame filled by the search above
        if found.empty:
            print(f"No tweet containing the word \"{self.conf.Search}\" could be found!")
            return found
        return found[self._COLUMNS]

    def __repr__(self):
        return (f"TwitterScraper(from_date={self.from_date}, to_date={self.to_date}, "
                f"num_tweets={self.num_tweets})")
                                                                                


if __name__ == "__main__":
    # Manual smoke test: fetch up to 40 tweets of one account inside a fixed
    # three-month window and show a preview, the frame size and its columns.
    scraper = TwitterScraper(from_date="2022-05-01", to_date="2022-07-31", num_tweets=40)
    tweets = scraper.scrape_by_user("jimmieakesson")
    for summary in (tweets.head(), tweets.shape, tweets.columns):
        print(summary)