File size: 6,578 Bytes
ae34e1d
6718c74
 
 
b24e23b
 
7edfe41
 
 
 
 
 
 
 
fa0430c
7edfe41
ae34e1d
 
 
 
 
 
 
 
 
 
 
 
 
38b2250
ae34e1d
7edfe41
 
 
 
b24e23b
58a2a29
7edfe41
 
 
 
b24e23b
da9055d
 
 
 
 
 
 
 
 
 
 
 
 
7edfe41
 
 
 
 
 
 
 
c6bed98
 
 
 
 
 
7edfe41
 
 
 
58a2a29
7edfe41
 
 
 
 
 
 
 
 
 
 
 
 
 
fa0430c
7edfe41
 
 
c6bed98
 
7edfe41
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58a2a29
7edfe41
c6bed98
58a2a29
 
 
 
ae34e1d
7edfe41
58a2a29
7edfe41
 
 
 
 
03f299a
 
 
 
b24e23b
5cf061d
7edfe41
deaf095
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
import re
from datetime import date

import twint


class _ScraperArgumentError(ValueError, AssertionError):
    """Raised when an argument given to TwitterScraper is invalid.

    It is a ValueError because that is the conventional type for a bad
    argument value, and also an AssertionError so that callers written against
    the earlier ``assert``-based validation keep catching it.
    """


class TwitterScraper(object):
    """
    Collect tweets with twint and return them as a pandas dataframe.

    The scraper is configured once with a date window ('from_date' to
    'to_date') and a maximum number of tweets ('num_tweets'); each
    ``scrape_*`` method then runs one search with those settings.
    input: from_date, to_date, num_tweets
    output: dataframe (one row per tweet)
    """

    # Largest value accepted for num_tweets.
    MAX_TWEETS = 60

    # Columns kept from the dataframe produced by twint.
    _COLUMNS = ["id", "tweet", "date", "user_id", "username", "urls", 'nlikes', 'nreplies', 'nretweets']

    def __init__(self, from_date="2022-07-01", to_date=None, num_tweets=20):
        """
        Validate and store the search window and create the twint configuration.

        :param from_date: str (format: YYYY-MM-DD), start of the search window
        :param to_date: str (format: YYYY-MM-DD) or None; None means today's date, resolved when the
            scraper is created (not when the module is imported)
        :param num_tweets: int (number of tweets to be scraped), between 1 and 60
        :raises ValueError: if a date is malformed or not a real calendar date, if from_date is not
            earlier than to_date, or if num_tweets is out of range (the raised error is also an
            AssertionError for backward compatibility)
        """
        if to_date is None:
            # Resolved per call: a default evaluated in the signature would be frozen at import time.
            to_date = str(date.today())

        start = self._parse_date(from_date, "from_date")
        end = self._parse_date(to_date, "to_date")
        # Make sure to_date is later than from_date
        if not start < end:
            raise _ScraperArgumentError("from_date must be earlier than to_date")
        # Make sure num_tweets is within the supported range
        if not 0 < num_tweets <= self.MAX_TWEETS:
            raise _ScraperArgumentError("num_tweets must be a positive integer and at most 60")

        self.from_date = from_date
        self.to_date = to_date
        self.num_tweets = num_tweets
        self.conf = twint.Config()

    @staticmethod
    def _parse_date(value, name):
        """
        Return 'value' parsed as a date, rejecting anything that is not a real YYYY-MM-DD date.

        :param value: str to validate
        :param name: name of the argument, used in the error message
        :return: datetime.date
        """
        # fullmatch pins the exact layout; fromisoformat then rejects dates such as 2022-13-45.
        if re.fullmatch(r'\d{4}-\d{2}-\d{2}', value) is None:
            raise _ScraperArgumentError(name + " must be in the format YYYY-MM-DD")
        try:
            return date.fromisoformat(value)
        except ValueError as err:
            raise _ScraperArgumentError(name + " must be in the format YYYY-MM-DD") from err

    def _run_search(self, query, username=None):
        """
        Set the search query and the username filter on the twint configuration, then run the search.

        The username is always written (None when unused) so that a filter set by an earlier call on
        the same scraper cannot leak into the next search.
        """
        self.conf.Search = query
        self.conf.Username = username
        return self.__get_tweets__from_twint__()

    def scrape_by_user(self, _user: str):
        """
        This method uses twint to extract tweets based on username.

        :param _user: str (username without the leading '@')
        :return: dataframe
        """
        # Prefixing the query with "from:@" makes the search match the author rather than the text.
        return self._run_search("from:@" + _user)

    def scrape_by_several_users(self, _users: list):
        """
        This method uses twint to extract tweets based on username. It takes a list of users as input.

        :param _users: list of users
        :return: dataframe
        :raises ValueError: if the list is empty
        """
        # TODO: test this method
        if not _users:
            raise _ScraperArgumentError("_users must contain at least one username")
        return self._run_search(" OR ".join("from:@" + user for user in _users))

    def scrape_by_string(self, _string: str):
        """
        This method uses twint to extract tweets based on a search string.

        :param _string: str (search query handed to twint)
        :return: dataframe
        """
        return self._run_search(_string)

    def scrape_by_user_and_string(self, _user: str, _string: str):
        """
        This method uses twint to extract tweets based on string and username.

        :param _user: str
        :param _string: str
        :return: dataframe
        """
        return self._run_search(_string, username=_user)

    # TODO: make method static (Possibly remove this)
    def get_only_tweets(self, tweet_and_replies_info):
        """
        Return the rows of a scraped dataframe whose text does not start with "@".

        A leading "@" is treated as the marker of a reply. The input is not modified and the
        surviving rows keep their original index labels.

        :param tweet_and_replies_info: dataframe with a "tweet" column (the output of a scrape method)
        :return: dataframe
        """
        if tweet_and_replies_info.empty:
            # Nothing to filter; an empty result may not even have a "tweet" column.
            return tweet_and_replies_info.copy()
        # A boolean mask works for any index, unlike positional numbers used as row labels.
        is_reply = tweet_and_replies_info["tweet"].str.startswith("@", na=False)
        return tweet_and_replies_info[~is_reply]

    def __get_tweets__from_twint__(self):
        """
        Run the configured twint search and return the result as a dataframe.

        The dataframe produced by twint has the following columns
            Index(['id', 'conversation_id', 'created_at', 'date', 'timezone', 'place',
            'tweet', 'language', 'hashtags', 'cashtags', 'user_id', 'user_id_str',
            'username', 'name', 'day', 'hour', 'link', 'urls', 'photos', 'video',
            'thumbnail', 'retweet', 'nlikes', 'nreplies', 'nretweets', 'quote_url',
            'search', 'near', 'geo', 'source', 'user_rt_id', 'user_rt',
            'retweet_id', 'reply_to', 'retweet_date', 'translate', 'trans_src',
            'trans_dest']
        we just pick the relevant ones. If nothing was found, a message is printed and the empty
        dataframe is returned as is.

        :return: dataframe
        """
        self.conf.Pandas = True  # the result is read back from twint's dataframe below
        self.conf.Count = True  # NOTE(review): presumably makes twint report a tweet count - confirm
        self.conf.Limit = self.num_tweets  # specifies how many tweets should be scraped
        self.conf.Since = self.from_date
        self.conf.Until = self.to_date
        self.conf.Hide_output = True  # Hides the output. If set to False it will print tweets in the terminal window.
        twint.run.Search(self.conf)
        found = twint.output.panda.Tweets_df
        if found.empty:
            print("No tweet containing the word \"" + self.conf.Search + "\" could be found!")
            return found
        return found[self._COLUMNS]

    def __repr__(self):
        return "TwitterScraper(from_date={}, to_date={}, num_tweets={})".format(self.from_date, self.to_date,
                                                                                self.num_tweets)


if __name__ == "__main__":
    # Demo run: fetch up to 40 tweets from one account inside a fixed date window
    # and print a preview, the dataframe dimensions and the column names.
    scraper = TwitterScraper(from_date="2022-05-01", to_date="2022-07-31", num_tweets=40)
    tweets = scraper.scrape_by_user("jimmieakesson")
    for summary in (tweets.head(), tweets.shape, tweets.columns):
        print(summary)