Spaces:
Runtime error
Runtime error
Merge pull request #32 from Demea9000/26-flytta-alla-notebooks-i-en-directory-notebooks
Browse files
.idea/misc.xml
CHANGED
@@ -1,4 +1,4 @@
|
|
1 |
<?xml version="1.0" encoding="UTF-8"?>
|
2 |
<project version="4">
|
3 |
-
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.
|
4 |
</project>
|
|
|
1 |
<?xml version="1.0" encoding="UTF-8"?>
|
2 |
<project version="4">
|
3 |
+
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.9 (politweet)" project-jdk-type="Python SDK" />
|
4 |
</project>
|
.idea/politweet.iml
CHANGED
@@ -5,7 +5,7 @@
|
|
5 |
<excludeFolder url="file://$MODULE_DIR$/politweet-environment" />
|
6 |
<excludeFolder url="file://$MODULE_DIR$/venv" />
|
7 |
</content>
|
8 |
-
<orderEntry type="
|
9 |
<orderEntry type="sourceFolder" forTests="false" />
|
10 |
</component>
|
11 |
</module>
|
|
|
5 |
<excludeFolder url="file://$MODULE_DIR$/politweet-environment" />
|
6 |
<excludeFolder url="file://$MODULE_DIR$/venv" />
|
7 |
</content>
|
8 |
+
<orderEntry type="jdk" jdkName="Python 3.9 (politweet)" jdkType="Python SDK" />
|
9 |
<orderEntry type="sourceFolder" forTests="false" />
|
10 |
</component>
|
11 |
</module>
|
politweet_notebook.ipynb → notebooks/politweet_notebook.ipynb
RENAMED
File without changes
|
notebooks/politweet_testing.ipynb
ADDED
The diff for this file is too large to render.
See raw diff
|
|
{twitter-scraper → notebooks}/twitter_scraper.ipynb
RENAMED
File without changes
|
twitter-scraper/TwitterScraper.py
CHANGED
@@ -1,17 +1,91 @@
|
|
1 |
import twint
|
2 |
-
import
|
3 |
|
4 |
-
c = twint.Config()
|
5 |
|
6 |
-
|
7 |
-
|
8 |
-
|
9 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
10 |
|
11 |
-
|
|
|
|
|
|
|
|
|
12 |
|
13 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
14 |
|
15 |
-
df = pd.read_csv('taylor_swift_tweets.csv')
|
16 |
|
17 |
-
print(df.head())
|
|
|
1 |
import twint
|
2 |
+
from datetime import date
|
3 |
|
|
|
4 |
|
5 |
+
class TwitterScraper(object):
    """Scrape tweets via twint.

    Collects a user's tweets from ``from_date`` to ``to_date``. If the dates
    are not specified, it collects ``num_tweets`` tweets counting back from
    today. The scrape methods return a pandas DataFrame with the tweet's
    unique id and some other information.

    input: from_date, to_date, num_tweets
    output: pandas.DataFrame
    """

    def __init__(self, from_date="2022-07-01", to_date=None, num_tweets=20):
        # NOTE: the original default was ``to_date=str(date.today())``, which
        # Python evaluates once at import time — a long-running process would
        # keep a stale date. Resolve "today" at call time instead; passing an
        # explicit to_date behaves exactly as before.
        self.from_date = from_date
        self.to_date = str(date.today()) if to_date is None else to_date
        self.num_tweets = num_tweets
        self.conf = twint.Config()

    def scrape_by_user(self, _user):
        """Extract tweets authored by the given username."""
        # The "from:@<user>" search syntax tells twint to filter by author.
        self.conf.Search = "from:@" + _user
        return self.__get_tweets__from_twint__()

    def scrape_by_string(self, _string: str):
        """Extract tweets containing the word(s) given in ``_string``.

        Every extracted tweet has the specified text somewhere in it.
        """
        self.conf.Search = _string  # plain text search configuration
        return self.__get_tweets__from_twint__()

    def scrape_by_user_and_string(self, _user: str, _string: str):
        """Extract tweets matching both a username and a search string."""
        self.conf.Username = _user
        self.conf.Search = _string
        return self.__get_tweets__from_twint__()

    def get_only_tweets(self, tweet_and_replies_info):
        """Remove replies/retweets from a scraped DataFrame.

        Takes the DataFrame produced by the scrape methods and drops every
        row whose tweet text starts with "@", which indicates a reply or a
        retweet. Returns the filtered DataFrame.
        """
        tweet_and_replies = tweet_and_replies_info["tweet"]
        indx_replies = [
            i
            for i in range(len(tweet_and_replies))
            if tweet_and_replies[i].startswith("@")
        ]
        # axis=0 drops the rows whose (positional) labels were collected.
        return tweet_and_replies_info.drop(labels=indx_replies, axis=0)

    def __get_tweets__from_twint__(self):
        """Run the configured twint search and return a trimmed DataFrame.

        twint's full output DataFrame has the following columns:
        Index(['id', 'conversation_id', 'created_at', 'date', 'timezone', 'place',
               'tweet', 'language', 'hashtags', 'cashtags', 'user_id', 'user_id_str',
               'username', 'name', 'day', 'hour', 'link', 'urls', 'photos', 'video',
               'thumbnail', 'retweet', 'nlikes', 'nreplies', 'nretweets', 'quote_url',
               'search', 'near', 'geo', 'source', 'user_rt_id', 'user_rt',
               'retweet_id', 'reply_to', 'retweet_date', 'translate', 'trans_src',
               'trans_dest'])
        We pick only the relevant ones. ``self.conf`` is a twint.Config()
        object; we also configure twint's output here.
        """
        self.conf.Pandas = True   # deliver results as a pandas DataFrame
        self.conf.Count = True    # print the number of tweets scraped
        self.conf.Limit = self.num_tweets  # how many tweets to scrape
        self.conf.Since = self.from_date
        self.conf.Until = self.to_date
        self.conf.Hide_output = True  # if False, tweets print to the terminal
        twint.run.Search(self.conf)
        tweet_and_replies_inf = twint.output.panda.Tweets_df
        tweet_and_replies_inf = tweet_and_replies_inf[
            ["id", "tweet", "date", "user_id", "username", "urls",
             'nlikes', 'nreplies', 'nretweets']]
        return tweet_and_replies_inf
|
85 |
+
if __name__ == "__main__":
    # Quick manual smoke test: scrape 40 tweets from one account over a
    # fixed date window and show a preview plus the result's dimensions.
    scraper = TwitterScraper(from_date="2022-05-01", to_date="2022-07-31", num_tweets=40)
    frame = scraper.scrape_by_user("jimmieakesson")
    print(frame.head())
    print(frame.shape)
|
90 |
|
|
|
91 |
|
|
twitter-scraper/scrape.py
DELETED
@@ -1,91 +0,0 @@
|
|
1 |
-
import twint
|
2 |
-
from datetime import date
|
3 |
-
|
4 |
-
|
5 |
-
class TwitterScraper(object):
|
6 |
-
"""
|
7 |
-
This class is a twitter TwitterScraper called TwitterScraper. It takes the user as input and collects the user's tweets
|
8 |
-
from 'from_date' to 'to_date'. If 'from_date' and 'to_date' are not specified, it collects the number of tweets 'num_tweets' from today.
|
9 |
-
It outputs a dictionary with the tweet unique id and some other information.
|
10 |
-
input: user, from_date, to_date, num_tweets
|
11 |
-
output: dict
|
12 |
-
"""
|
13 |
-
def __init__(self, from_date="2006-07-01", to_date=str(date.today()), num_tweets=20):
|
14 |
-
self.from_date = from_date
|
15 |
-
self.to_date = to_date
|
16 |
-
self.num_tweets = num_tweets
|
17 |
-
self.conf = twint.Config()
|
18 |
-
|
19 |
-
def scrape_by_user(self, _user):
|
20 |
-
"""This method uses twint to extract tweets based on username"""
|
21 |
-
self.conf.Search = "from:@" + _user # is the search configuration is given in this format it searches after
|
22 |
-
# user_names.
|
23 |
-
return self.__get_tweets__from_twint__()
|
24 |
-
|
25 |
-
def scrape_by_string(self, _string: str):
|
26 |
-
"""This method uses twint to extract tweets based on string.
|
27 |
-
all extracted tweets have the specified word in _string parameter in it.
|
28 |
-
"""
|
29 |
-
self.conf.Search = _string # this tells twint configuration to search for string
|
30 |
-
return self.__get_tweets__from_twint__()
|
31 |
-
|
32 |
-
def scrape_by_user_and_string(self, _user: str, _string: str):
|
33 |
-
"""This method uses twint to extract tweets brased on string and username"""
|
34 |
-
self.conf.Username = _user
|
35 |
-
self.conf.Search = _string
|
36 |
-
return self.__get_tweets__from_twint__()
|
37 |
-
|
38 |
-
def get_only_tweets(self, tweet_and_replies_info):
|
39 |
-
tweet_and_replies = tweet_and_replies_info["tweet"]
|
40 |
-
"""
|
41 |
-
This functions input arg is a data frame (the output from scrape methords ) and removes...
|
42 |
-
all tweets starting with \"@\" which is indicator of a reply or retweet.
|
43 |
-
"""
|
44 |
-
indx_replies = []
|
45 |
-
for i in range(len(tweet_and_replies)):
|
46 |
-
if tweet_and_replies[i].startswith("@"):
|
47 |
-
indx_replies.append(i)
|
48 |
-
|
49 |
-
tweets_info = tweet_and_replies_info.drop(labels=indx_replies, axis=0)
|
50 |
-
# drop removes the columns which its index specified by
|
51 |
-
# indx_replies. axis=0 if we want to delete rows.
|
52 |
-
#print(len(tweets['tweet']), " of them are Tweets")
|
53 |
-
return tweets_info
|
54 |
-
|
55 |
-
def __get_tweets__from_twint__(self):
|
56 |
-
""" __get_tweets_from_twint__
|
57 |
-
tweet info is a dataframe with fallowing columns
|
58 |
-
Index(['id', 'conversation_id', 'created_at', 'date', 'timezone', 'place',
|
59 |
-
'tweet', 'language', 'hashtags', 'cashtags', 'user_id', 'user_id_str',
|
60 |
-
'username', 'name', 'day', 'hour', 'link', 'urls', 'photos', 'video',
|
61 |
-
'thumbnail', 'retweet', 'nlikes', 'nreplies', 'nretweets', 'quote_url',
|
62 |
-
'search', 'near', 'geo', 'source', 'user_rt_id', 'user_rt',
|
63 |
-
'retweet_id', 'reply_to', 'retweet_date', 'translate', 'trans_src',
|
64 |
-
'trans_dest']
|
65 |
-
we just pick the relevant ones.
|
66 |
-
c is a twint.Config() object
|
67 |
-
we also configure twint output.
|
68 |
-
"""
|
69 |
-
self.conf.Pandas = True #
|
70 |
-
self.conf.Count = True #
|
71 |
-
self.conf.Limit = self.num_tweets # specifies how many tweet should be scraped
|
72 |
-
self.conf.Since = self.from_date
|
73 |
-
self.conf.Until = self.to_date
|
74 |
-
self.conf.Hide_output = True # Hides the output. If set to False it will prints tweets in the terminal window.
|
75 |
-
twint.run.Search(self.conf)
|
76 |
-
tweet_and_replies_inf = twint.output.panda.Tweets_df # here we say that output souldwe dataframe.
|
77 |
-
tweet_and_replies_inf = tweet_and_replies_inf[
|
78 |
-
["id", "tweet", "date", "user_id", "username", "urls", 'nlikes', 'nreplies', 'nretweets']]
|
79 |
-
return tweet_and_replies_inf
|
80 |
-
# def __check_date_type(d1,d2): if (type(d1) or type(d2)) is not type("str"): # If the type of ite date input
|
81 |
-
# is not string it generates exception print("[!] Please make sure the date is a string in this format
|
82 |
-
# \"yyyy-mm-dd\" ") raise EXCEPTION("Incorrect date type Exception!") elif (len(d1.split("-")) or len(d2.split(
|
83 |
-
# "-")))<2: print("[!] Please make sure the date is a string in this format \"yyyy-mm-dd\" ") raise EXCEPTION(
|
84 |
-
# "Incorrect date type Exception!")
|
85 |
-
if __name__ == "__main__":
|
86 |
-
sc = TwitterScraper(num_tweets=1002)
|
87 |
-
dc = sc.scrape_by_string("jimmieakesson")
|
88 |
-
print(dc.head())
|
89 |
-
print(dc.shape)
|
90 |
-
|
91 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|