4stra committed on
Commit
209bbc5
1 Parent(s): 70f6eb7

Add files via upload

Browse files
Files changed (1) hide show
  1. twitter-scraper/scrape.py +94 -0
twitter-scraper/scrape.py ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import sys
import io
import time
import asyncio
import os

# Ensure this thread has a current event loop before twint is imported.
# asyncio.get_event_loop() raises RuntimeError when no loop is set (always the
# case in non-main threads, and in newer Python versions per the asyncio docs),
# so fall back to creating and registering a fresh loop instead of failing at
# import time.
try:
    loop = asyncio.get_event_loop()
except RuntimeError:
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)

import twint
import nest_asyncio

# Patch asyncio so an already-running loop can be re-entered; presumably needed
# because twint runs its searches through the event loop (e.g. inside notebooks).
nest_asyncio.apply()

from datetime import date
12
class scraper:
    """Helpers that run twint searches and return the results.

    The methods take no ``self`` and are intended to be called on the class,
    e.g. ``scraper.get_tweets("python")``.
    """

    @staticmethod
    def get_tweets(search_str, from_date="2006-07-01", to_date=None, num_tweets=10, u_or_s='s'):
        """Collect up to ``num_tweets`` tweets that do not start with ``@``.

        The search is repeated (at most four times, with a growing twint
        ``Limit``) until enough tweets are collected, roughly two minutes have
        elapsed, or twint reports ``[!] No more data!``.

        Args:
            search_str: Search phrase, or a user name when ``u_or_s`` is "u".
            from_date: Lower date bound passed to twint as ``Since``.
            to_date: Upper date bound passed to twint as ``Until``. ``None``
                (the default) means today's date, resolved at call time.
            num_tweets: Number of tweets wanted.
            u_or_s: "u" (case-insensitive) searches ``from:@<search_str>``;
                any other value searches ``search_str`` as-is.

        Returns:
            dict mapping ``int`` tweet id to a dict with the keys ``tweet``,
            ``date``, ``nlikes``, ``nreplies``, ``nretweets`` and ``topic``
            (``topic`` is always the empty string here).
        """
        # Resolve the default per call; a default of str(date.today()) in the
        # signature would be frozen at import time.
        if to_date is None:
            to_date = str(date.today())

        deadline = time.time() + 2 * 60
        collected = {}

        config = twint.Config()
        if u_or_s.lower() == "u":
            config.Search = "from:@" + search_str
        else:
            config.Search = search_str
        config.Pandas = True
        config.Count = True
        config.Since = from_date
        config.Until = to_date
        config.Hide_output = True

        fetch_limit = num_tweets
        for attempt in range(1, 5):
            config.Limit = fetch_limit

            # Capture what twint prints so the "no more data" marker can be
            # detected; always restore stdout, even if the search raises.
            captured = io.StringIO()
            saved_stdout = sys.stdout
            sys.stdout = captured
            try:
                twint.run.Search(config)
            finally:
                sys.stdout = saved_stdout
            output = captured.getvalue()
            print(output[0:-2])

            # tweet_info is a DataFrame with the following columns:
            #   id, conversation_id, created_at, date, timezone, place, tweet,
            #   language, hashtags, cashtags, user_id, user_id_str, username,
            #   name, day, hour, link, urls, photos, video, thumbnail, retweet,
            #   nlikes, nreplies, nretweets, quote_url, search, near, geo,
            #   source, user_rt_id, user_rt, retweet_id, reply_to,
            #   retweet_date, translate, trans_src, trans_dest
            tweet_info = twint.output.panda.Tweets_df

            try:
                ids = tweet_info["id"]
                texts = tweet_info["tweet"]
                dates = tweet_info["date"]
                likes = tweet_info["nlikes"]
                replies = tweet_info["nreplies"]
                retweets = tweet_info["nretweets"]
                for pos in range(len(ids)):
                    if len(collected) >= num_tweets:
                        break
                    text = texts.iloc[pos]
                    # Tweets starting with "@" are skipped (presumably replies).
                    if text.startswith("@"):
                        continue
                    # Normalise to int before the duplicate check so it matches
                    # the keys actually stored in the result.
                    tweet_id = int(ids.iloc[pos])
                    if tweet_id in collected:
                        continue
                    collected[tweet_id] = {
                        "tweet": text,
                        "date": dates.iloc[pos],
                        "nlikes": likes.iloc[pos],
                        "nreplies": replies.iloc[pos],
                        "nretweets": retweets.iloc[pos],
                        "topic": "",
                    }
            except (KeyError, TypeError, AttributeError, ValueError, IndexError):
                # Best effort: a missing/empty result frame or a malformed row
                # ends this pass; whatever was collected so far is kept.
                pass

            print(len(collected), " of them are Tweets")

            if len(collected) >= num_tweets:
                break
            # NOTE(review): the original raised the limit in two identical
            # back-to-back blocks; the effective step is kept as-is. Confirm
            # whether a single 100 * 3 ** attempt step was intended.
            fetch_limit += 2 * 100 * 3 ** attempt
            if deadline < time.time():
                break
            if output.startswith("[!] No more data!"):
                break
        return collected

    @staticmethod
    def string_search_user_tweets(user_name, search_str, from_date="2006-07-01", to_date=None, num_tweets=10):
        """Run one twint search for ``search_str`` restricted to ``user_name``.

        Args:
            user_name: Value assigned to twint's ``Username`` setting.
            search_str: Search phrase.
            from_date: Lower date bound passed to twint as ``Since``.
            to_date: Upper date bound passed to twint as ``Until``. ``None``
                (the default) means today's date, resolved at call time.
            num_tweets: Value assigned to twint's ``Limit`` setting.

        Returns:
            ``twint.output.panda.Tweets_df`` as left by the search.
        """
        if to_date is None:
            to_date = str(date.today())

        config = twint.Config()
        config.Username = user_name
        config.Search = search_str
        config.Pandas = True
        config.Count = True
        config.Limit = num_tweets
        config.Since = from_date
        config.Until = to_date
        config.Hide_output = True
        twint.run.Search(config)
        return twint.output.panda.Tweets_df
91
+
92
+
93
+
94
+