Spaces:
Runtime error
Runtime error
Merge pull request #64 from Demea9000/63-fix-todos-in-twitterscraper
Browse files
textclassifier/TextClassifier.py
CHANGED
@@ -29,21 +29,15 @@ class TextClassifier:
|
|
29 |
:param to_date: string of the format 'YYYY-MM-DD'.
|
30 |
:param num_tweets: integer value of the maximum number of tweets to be scraped.
|
31 |
"""
|
32 |
-
# Make sure to_date is later than from_date
|
33 |
-
assert from_date < to_date, "from_date must be earlier than to_date"
|
34 |
-
# Make sure the dates are in the correct format
|
35 |
-
assert re.match(r'^\d{4}-\d{2}-\d{2}$', from_date) is not None, "from_date must be in the format YYYY-MM-DD"
|
36 |
# Make sure user_name is not empty
|
37 |
assert user_name is not None, "user_name cannot be empty"
|
38 |
-
# Make sure num_tweets is a positive integer
|
39 |
-
assert 0 < num_tweets <= 20, "num_tweets must be a positive integer and at most 20"
|
40 |
|
|
|
41 |
self.model_name = model_name
|
42 |
self.from_date = from_date
|
43 |
self.to_date = to_date
|
44 |
self.num_tweets = num_tweets
|
45 |
self.user_name = user_name
|
46 |
-
self.ts = TwitterScraper.TwitterScraper(from_date, to_date, num_tweets)
|
47 |
# Assure that scrape_by_user actually gets num_tweets
|
48 |
# add timer in time-loop and stop after 10 seconds
|
49 |
start_time = time.time()
|
|
|
29 |
:param to_date: string of the format 'YYYY-MM-DD'.
|
30 |
:param num_tweets: integer value of the maximum number of tweets to be scraped.
|
31 |
"""
|
|
|
|
|
|
|
|
|
32 |
# Make sure user_name is not empty
|
33 |
assert user_name is not None, "user_name cannot be empty"
|
|
|
|
|
34 |
|
35 |
+
self.ts = TwitterScraper.TwitterScraper(from_date, to_date, num_tweets)
|
36 |
self.model_name = model_name
|
37 |
self.from_date = from_date
|
38 |
self.to_date = to_date
|
39 |
self.num_tweets = num_tweets
|
40 |
self.user_name = user_name
|
|
|
41 |
# Assure that scrape_by_user actually gets num_tweets
|
42 |
# add timer in time-loop and stop after 10 seconds
|
43 |
start_time = time.time()
|
twitterscraper/TwitterScraper.py
CHANGED
@@ -1,6 +1,7 @@
|
|
1 |
import pandas as pd
|
2 |
import twint
|
3 |
from datetime import date
|
|
|
4 |
|
5 |
|
6 |
class TwitterScraper(object):
|
@@ -13,10 +14,21 @@ class TwitterScraper(object):
|
|
13 |
"""
|
14 |
|
15 |
def __init__(self, from_date="2022-07-01", to_date=str(date.today()), num_tweets=20):
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
20 |
self.from_date = from_date
|
21 |
self.to_date = to_date
|
22 |
self.num_tweets = num_tweets
|
@@ -48,9 +60,13 @@ class TwitterScraper(object):
|
|
48 |
self.conf.Search = _string # this tells twint configuration to search for string
|
49 |
return self.__get_tweets__from_twint__()
|
50 |
|
51 |
-
# TODO: Possibly include more than one user
|
52 |
def scrape_by_user_and_string(self, _user: str, _string: str):
|
53 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
54 |
self.conf.Username = _user
|
55 |
self.conf.Search = _string
|
56 |
return self.__get_tweets__from_twint__()
|
@@ -74,7 +90,8 @@ class TwitterScraper(object):
|
|
74 |
return tweets_info
|
75 |
|
76 |
def __get_tweets__from_twint__(self):
|
77 |
-
"""
|
|
|
78 |
tweet info is a dataframe with the following columns
|
79 |
Index(['id', 'conversation_id', 'created_at', 'date', 'timezone', 'place',
|
80 |
'tweet', 'language', 'hashtags', 'cashtags', 'user_id', 'user_id_str',
|
@@ -94,12 +111,12 @@ class TwitterScraper(object):
|
|
94 |
self.conf.Until = self.to_date
|
95 |
self.conf.Hide_output = True # Hides the output. If set to False it will print tweets in the terminal window.
|
96 |
twint.run.Search(self.conf)
|
97 |
-
tweet_and_replies_inf = twint.output.panda.Tweets_df # here we say that output
|
98 |
if tweet_and_replies_inf.empty:
|
99 |
print("No tweet containing the word \"" + self.conf.Search + "\" could be found!")
|
100 |
else:
|
101 |
tweet_and_replies_inf = tweet_and_replies_inf[
|
102 |
-
|
103 |
return tweet_and_replies_inf
|
104 |
|
105 |
# def __check_date_type(d1,d2): if (type(d1) or type(d2)) is not type("str"): # If the type of ite date input
|
@@ -111,7 +128,6 @@ class TwitterScraper(object):
|
|
111 |
def __repr__(self):
|
112 |
return "TwitterScraper(from_date={}, to_date={}, num_tweets={})".format(self.from_date, self.to_date,
|
113 |
self.num_tweets)
|
114 |
-
|
115 |
|
116 |
|
117 |
if __name__ == "__main__":
|
|
|
1 |
import pandas as pd
|
2 |
import twint
|
3 |
from datetime import date
|
4 |
+
import re
|
5 |
|
6 |
|
7 |
class TwitterScraper(object):
|
|
|
14 |
"""
|
15 |
|
16 |
def __init__(self, from_date="2022-07-01", to_date=str(date.today()), num_tweets=20):
|
17 |
+
"""
|
18 |
+
This method initializes the TwitterScraper class. It stores the date range and tweet limit used to collect tweets
|
19 |
+
from 'from_date' to 'to_date'. If 'from_date' and 'to_date' are not specified, it collects the number of
|
20 |
+
tweets 'num_tweets' from today.
|
21 |
+
:param from_date: str (format: YYYY-MM-DD)
|
22 |
+
:param to_date: str (format: YYYY-MM-DD)
|
23 |
+
:param num_tweets: int (number of tweets to be scraped)
|
24 |
+
"""
|
25 |
+
# Make sure the dates are in the correct format
|
26 |
+
assert re.match(r'^\d{4}-\d{2}-\d{2}$', from_date) is not None, "from_date must be in the format YYYY-MM-DD"
|
27 |
+
# Make sure to_date is later than from_date
|
28 |
+
assert from_date < to_date, "from_date must be earlier than to_date"
|
29 |
+
# Make sure num_tweets is a positive integer
|
30 |
+
assert 0 < num_tweets <= 20, "num_tweets must be a positive integer and at most 20"
|
31 |
+
|
32 |
self.from_date = from_date
|
33 |
self.to_date = to_date
|
34 |
self.num_tweets = num_tweets
|
|
|
60 |
self.conf.Search = _string # this tells twint configuration to search for string
|
61 |
return self.__get_tweets__from_twint__()
|
62 |
|
|
|
63 |
def scrape_by_user_and_string(self, _user: str, _string: str):
|
64 |
+
"""
|
65 |
+
This method uses twint to extract tweets based on string and username. It takes a single username and a search string as input.
|
66 |
+
:param _user: str
|
67 |
+
:param _string: str
|
68 |
+
:return: dataframe
|
69 |
+
"""
|
70 |
self.conf.Username = _user
|
71 |
self.conf.Search = _string
|
72 |
return self.__get_tweets__from_twint__()
|
|
|
90 |
return tweets_info
|
91 |
|
92 |
def __get_tweets__from_twint__(self):
|
93 |
+
"""
|
94 |
+
__get_tweets_from_twint__
|
95 |
tweet info is a dataframe with the following columns
|
96 |
Index(['id', 'conversation_id', 'created_at', 'date', 'timezone', 'place',
|
97 |
'tweet', 'language', 'hashtags', 'cashtags', 'user_id', 'user_id_str',
|
|
|
111 |
self.conf.Until = self.to_date
|
112 |
self.conf.Hide_output = True # Hides the output. If set to False it will print tweets in the terminal window.
|
113 |
twint.run.Search(self.conf)
|
114 |
+
tweet_and_replies_inf = twint.output.panda.Tweets_df # here we say that output is a dataframe
|
115 |
if tweet_and_replies_inf.empty:
|
116 |
print("No tweet containing the word \"" + self.conf.Search + "\" could be found!")
|
117 |
else:
|
118 |
tweet_and_replies_inf = tweet_and_replies_inf[
|
119 |
+
["id", "tweet", "date", "user_id", "username", "urls", 'nlikes', 'nreplies', 'nretweets']]
|
120 |
return tweet_and_replies_inf
|
121 |
|
122 |
# def __check_date_type(d1,d2): if (type(d1) or type(d2)) is not type("str"): # If the type of ite date input
|
|
|
128 |
def __repr__(self):
|
129 |
return "TwitterScraper(from_date={}, to_date={}, num_tweets={})".format(self.from_date, self.to_date,
|
130 |
self.num_tweets)
|
|
|
131 |
|
132 |
|
133 |
if __name__ == "__main__":
|