Demea9000 committed on
Commit
7edfe41
1 Parent(s): dcd24fe

cleaned up TwitterScraper

Browse files
.idea/misc.xml CHANGED
@@ -1,4 +1,4 @@
1
  <?xml version="1.0" encoding="UTF-8"?>
2
  <project version="4">
3
- <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10 (politweet)" project-jdk-type="Python SDK" />
4
  </project>
1
  <?xml version="1.0" encoding="UTF-8"?>
2
  <project version="4">
3
+ <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.9 (politweet)" project-jdk-type="Python SDK" />
4
  </project>
.idea/politweet.iml CHANGED
@@ -5,7 +5,7 @@
5
  <excludeFolder url="file://$MODULE_DIR$/politweet-environment" />
6
  <excludeFolder url="file://$MODULE_DIR$/venv" />
7
  </content>
8
- <orderEntry type="inheritedJdk" />
9
  <orderEntry type="sourceFolder" forTests="false" />
10
  </component>
11
  </module>
5
  <excludeFolder url="file://$MODULE_DIR$/politweet-environment" />
6
  <excludeFolder url="file://$MODULE_DIR$/venv" />
7
  </content>
8
+ <orderEntry type="jdk" jdkName="Python 3.9 (politweet)" jdkType="Python SDK" />
9
  <orderEntry type="sourceFolder" forTests="false" />
10
  </component>
11
  </module>
twitter-scraper/TwitterScraper.py CHANGED
@@ -1,17 +1,91 @@
1
  import twint
2
- import datetime
3
 
4
- c = twint.Config()
5
 
6
- c.Search = ['Taylor Swift'] # topic
7
- c.Limit = 500 # number of Tweets to scrape
8
- c.Store_csv = True # store tweets in a csv file
9
- c.Output = "taylor_swift_tweets.csv" # path to csv file
 
 
 
 
 
 
 
 
 
10
 
11
- twint.run.Search(c)
 
 
 
 
12
 
13
- import pandas as pd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
 
15
- df = pd.read_csv('taylor_swift_tweets.csv')
16
 
17
- print(df.head())
1
  import twint
2
+ from datetime import date
3
 
 
4
 
5
+ class TwitterScraper(object):
6
+ """
7
+ This class is a Twitter scraper called TwitterScraper. It takes the user as input and collects the user's tweets
8
+ from 'from_date' to 'to_date'. If 'from_date' and 'to_date' are not specified, it collects the number of tweets 'num_tweets' from today.
9
+ It outputs a pandas DataFrame with the tweet's unique id and some other information.
10
+ input: user, from_date, to_date, num_tweets
11
+ output: pandas DataFrame
12
+ """
13
+ def __init__(self, from_date="2022-07-01", to_date=str(date.today()), num_tweets=20):
14
+ self.from_date = from_date
15
+ self.to_date = to_date
16
+ self.num_tweets = num_tweets
17
+ self.conf = twint.Config()
18
 
19
+ def scrape_by_user(self, _user):
20
+ """This method uses twint to extract tweets based on username"""
21
+ self.conf.Search = "from:@" + _user # if the search configuration is given in this format it searches for
22
+ # user_names.
23
+ return self.__get_tweets__from_twint__()
24
 
25
+ def scrape_by_string(self, _string: str):
26
+ """This method uses twint to extract tweets based on string.
27
+ All extracted tweets contain the word specified in the _string parameter.
28
+ """
29
+ self.conf.Search = _string # this tells twint configuration to search for string
30
+ return self.__get_tweets__from_twint__()
31
+
32
+ def scrape_by_user_and_string(self, _user: str, _string: str):
33
+ """This method uses twint to extract tweets based on string and username"""
34
+ self.conf.Username = _user
35
+ self.conf.Search = _string
36
+ return self.__get_tweets__from_twint__()
37
+
38
+ def get_only_tweets(self, tweet_and_replies_info):
39
+ tweet_and_replies = tweet_and_replies_info["tweet"]
40
+ """
41
+ This function's input argument is a data frame (the output of the scrape methods); it removes
42
+ all tweets starting with \"@\", which indicates a reply or retweet.
43
+ """
44
+ indx_replies = []
45
+ for i in range(len(tweet_and_replies)):
46
+ if tweet_and_replies[i].startswith("@"):
47
+ indx_replies.append(i)
48
+
49
+ tweets_info = tweet_and_replies_info.drop(labels=indx_replies, axis=0)
50
+ # drop removes the rows whose index labels are listed in
51
+ # indx_replies; axis=0 means rows (not columns) are deleted.
52
+ #print(len(tweets['tweet']), " of them are Tweets")
53
+ return tweets_info
54
+
55
+ def __get_tweets__from_twint__(self):
56
+ """ __get_tweets_from_twint__
57
+ tweet info is a dataframe with the following columns
58
+ Index(['id', 'conversation_id', 'created_at', 'date', 'timezone', 'place',
59
+ 'tweet', 'language', 'hashtags', 'cashtags', 'user_id', 'user_id_str',
60
+ 'username', 'name', 'day', 'hour', 'link', 'urls', 'photos', 'video',
61
+ 'thumbnail', 'retweet', 'nlikes', 'nreplies', 'nretweets', 'quote_url',
62
+ 'search', 'near', 'geo', 'source', 'user_rt_id', 'user_rt',
63
+ 'retweet_id', 'reply_to', 'retweet_date', 'translate', 'trans_src',
64
+ 'trans_dest']
65
+ we just pick the relevant ones.
66
+ c is a twint.Config() object
67
+ we also configure twint output.
68
+ """
69
+ self.conf.Pandas = True #
70
+ self.conf.Count = True #
71
+ self.conf.Limit = self.num_tweets # specifies how many tweets should be scraped
72
+ self.conf.Since = self.from_date
73
+ self.conf.Until = self.to_date
74
+ self.conf.Hide_output = True # Hides the output. If set to False it will print tweets in the terminal window.
75
+ twint.run.Search(self.conf)
76
+ tweet_and_replies_inf = twint.output.panda.Tweets_df # here we say that the output should be a dataframe.
77
+ tweet_and_replies_inf = tweet_and_replies_inf[
78
+ ["id", "tweet", "date", "user_id", "username", "urls", 'nlikes', 'nreplies', 'nretweets']]
79
+ return tweet_and_replies_inf
80
+ # def __check_date_type(d1,d2): if (type(d1) or type(d2)) is not type("str"): # If the type of the date input
81
+ # is not string it generates exception print("[!] Please make sure the date is a string in this format
82
+ # \"yyyy-mm-dd\" ") raise EXCEPTION("Incorrect date type Exception!") elif (len(d1.split("-")) or len(d2.split(
83
+ # "-")))<2: print("[!] Please make sure the date is a string in this format \"yyyy-mm-dd\" ") raise EXCEPTION(
84
+ # "Incorrect date type Exception!")
85
+ if __name__ == "__main__":
86
+ sc = TwitterScraper(from_date="2022-05-01", to_date="2022-07-31", num_tweets=40)
87
+ dc = sc.scrape_by_user("jimmieakesson")
88
+ print(dc.head())
89
+ print(dc.shape)
90
 
 
91
 
 
twitter-scraper/scrape.py DELETED
@@ -1,91 +0,0 @@
1
- import twint
2
- from datetime import date
3
-
4
-
5
- class TwitterScraper(object):
6
- """
7
- This class is a twitter TwitterScraper called TwitterScraper. It takes the user as input and collects the user's tweets
8
- from 'from_date' to 'to_date'. If 'from_date' and 'to_date' are not specified, it collects the number of tweets 'num_tweets' from today.
9
- It outputs a dictionary with the tweet unique id and some other information.
10
- input: user, from_date, to_date, num_tweets
11
- output: dict
12
- """
13
- def __init__(self, from_date="2006-07-01", to_date=str(date.today()), num_tweets=20):
14
- self.from_date = from_date
15
- self.to_date = to_date
16
- self.num_tweets = num_tweets
17
- self.conf = twint.Config()
18
-
19
- def scrape_by_user(self, _user):
20
- """This method uses twint to extract tweets based on username"""
21
- self.conf.Search = "from:@" + _user # is the search configuration is given in this format it searches after
22
- # user_names.
23
- return self.__get_tweets__from_twint__()
24
-
25
- def scrape_by_string(self, _string: str):
26
- """This method uses twint to extract tweets based on string.
27
- all extracted tweets have the specified word in _string parameter in it.
28
- """
29
- self.conf.Search = _string # this tells twint configuration to search for string
30
- return self.__get_tweets__from_twint__()
31
-
32
- def scrape_by_user_and_string(self, _user: str, _string: str):
33
- """This method uses twint to extract tweets brased on string and username"""
34
- self.conf.Username = _user
35
- self.conf.Search = _string
36
- return self.__get_tweets__from_twint__()
37
-
38
- def get_only_tweets(self, tweet_and_replies_info):
39
- tweet_and_replies = tweet_and_replies_info["tweet"]
40
- """
41
- This functions input arg is a data frame (the output from scrape methords ) and removes...
42
- all tweets starting with \"@\" which is indicator of a reply or retweet.
43
- """
44
- indx_replies = []
45
- for i in range(len(tweet_and_replies)):
46
- if tweet_and_replies[i].startswith("@"):
47
- indx_replies.append(i)
48
-
49
- tweets_info = tweet_and_replies_info.drop(labels=indx_replies, axis=0)
50
- # drop removes the columns which its index specified by
51
- # indx_replies. axis=0 if we want to delete rows.
52
- #print(len(tweets['tweet']), " of them are Tweets")
53
- return tweets_info
54
-
55
- def __get_tweets__from_twint__(self):
56
- """ __get_tweets_from_twint__
57
- tweet info is a dataframe with fallowing columns
58
- Index(['id', 'conversation_id', 'created_at', 'date', 'timezone', 'place',
59
- 'tweet', 'language', 'hashtags', 'cashtags', 'user_id', 'user_id_str',
60
- 'username', 'name', 'day', 'hour', 'link', 'urls', 'photos', 'video',
61
- 'thumbnail', 'retweet', 'nlikes', 'nreplies', 'nretweets', 'quote_url',
62
- 'search', 'near', 'geo', 'source', 'user_rt_id', 'user_rt',
63
- 'retweet_id', 'reply_to', 'retweet_date', 'translate', 'trans_src',
64
- 'trans_dest']
65
- we just pick the relevant ones.
66
- c is a twint.Config() object
67
- we also configure twint output.
68
- """
69
- self.conf.Pandas = True #
70
- self.conf.Count = True #
71
- self.conf.Limit = self.num_tweets # specifies how many tweet should be scraped
72
- self.conf.Since = self.from_date
73
- self.conf.Until = self.to_date
74
- self.conf.Hide_output = True # Hides the output. If set to False it will prints tweets in the terminal window.
75
- twint.run.Search(self.conf)
76
- tweet_and_replies_inf = twint.output.panda.Tweets_df # here we say that output souldwe dataframe.
77
- tweet_and_replies_inf = tweet_and_replies_inf[
78
- ["id", "tweet", "date", "user_id", "username", "urls", 'nlikes', 'nreplies', 'nretweets']]
79
- return tweet_and_replies_inf
80
- # def __check_date_type(d1,d2): if (type(d1) or type(d2)) is not type("str"): # If the type of ite date input
81
- # is not string it generates exception print("[!] Please make sure the date is a string in this format
82
- # \"yyyy-mm-dd\" ") raise EXCEPTION("Incorrect date type Exception!") elif (len(d1.split("-")) or len(d2.split(
83
- # "-")))<2: print("[!] Please make sure the date is a string in this format \"yyyy-mm-dd\" ") raise EXCEPTION(
84
- # "Incorrect date type Exception!")
85
- if __name__ == "__main__":
86
- sc = TwitterScraper(num_tweets=1002)
87
- dc = sc.scrape_by_string("jimmieakesson")
88
- print(dc.head())
89
- print(dc.shape)
90
-
91
-