Spaces:
Runtime error
Runtime error
4stra
committed on
Commit
•
209bbc5
1
Parent(s):
70f6eb7
Add files via upload
Browse files- twitter-scraper/scrape.py +94 -0
twitter-scraper/scrape.py
ADDED
@@ -0,0 +1,94 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import sys
|
2 |
+
import io
|
3 |
+
import time
|
4 |
+
import asyncio
|
5 |
+
import os
|
6 |
+
loop = asyncio.get_event_loop()
|
7 |
+
loop.is_running()
|
8 |
+
import twint
|
9 |
+
import nest_asyncio
|
10 |
+
nest_asyncio.apply()
|
11 |
+
from datetime import date
|
12 |
+
class scraper:
    """Convenience wrappers around ``twint`` searches.

    Both helpers are static methods, so they can be called on the class
    (``scraper.get_tweets(...)``) or on an instance.
    """

    @staticmethod
    def get_tweets(search_str, from_date="2006-07-01", to_date=None, num_tweets=10, u_or_s='s'):
        """Collect up to ``num_tweets`` tweets that are not replies.

        Runs a twint search up to four times, raising the fetch limit after
        each pass, until enough tweets whose text does not start with ``@``
        have been gathered, a two-minute deadline passes, or twint prints
        ``[!] No more data!``.

        Args:
            search_str: Search query, or a user name when ``u_or_s`` is "u".
            from_date: Value assigned to twint's ``Since`` option.
            to_date: Value assigned to twint's ``Until`` option. ``None``
                (the default) means today's date, resolved on every call
                rather than once at import time.
            num_tweets: Number of tweets wanted.
            u_or_s: "u" (case-insensitive) searches ``from:@<search_str>``;
                anything else searches ``search_str`` as given.

        Returns:
            dict mapping the integer tweet id to a dict with the keys
            ``tweet``, ``date``, ``nlikes``, ``nreplies``, ``nretweets`` and
            ``topic`` (always an empty string here).
        """
        # Local import keeps this block self-contained; the module's import
        # header does not provide contextlib.
        import contextlib

        if to_date is None:
            to_date = str(date.today())

        deadline = time.time() + 2 * 60  # give up after two minutes
        tweets = {}

        c = twint.Config()
        if u_or_s.lower() == "u":
            c.Search = "from:@" + search_str  # tweets written by that user
        else:
            c.Search = search_str  # topic
        c.Pandas = True
        c.Count = True
        # These options never change between passes, so set them once.
        c.Since = from_date
        c.Until = to_date
        c.Hide_output = True

        fetch_limit = num_tweets
        for j in range(1, 5):
            c.Limit = fetch_limit

            # Capture whatever twint prints so it can be echoed and inspected
            # below. redirect_stdout restores sys.stdout even if the search
            # raises, which the previous manual swap did not.
            captured = io.StringIO()
            with contextlib.redirect_stdout(captured):
                twint.run.Search(c)
            output = captured.getvalue()
            print(output[0:-2])

            tweet_info = twint.output.panda.Tweets_df
            # tweet_info is a dataframe with the following columns:
            # Index(['id', 'conversation_id', 'created_at', 'date', 'timezone', 'place',
            #        'tweet', 'language', 'hashtags', 'cashtags', 'user_id', 'user_id_str',
            #        'username', 'name', 'day', 'hour', 'link', 'urls', 'photos', 'video',
            #        'thumbnail', 'retweet', 'nlikes', 'nreplies', 'nretweets', 'quote_url',
            #        'search', 'near', 'geo', 'source', 'user_rt_id', 'user_rt',
            #        'retweet_id', 'reply_to', 'retweet_date', 'translate', 'trans_src',
            #        'trans_dest'],
            #       dtype='object')
            try:
                # Presumably Tweets_df is None when twint published nothing;
                # treat that as "no rows" instead of an error.
                ids = tweet_info["id"] if tweet_info is not None else ()
                for i in range(len(ids)):
                    text = tweet_info["tweet"][i]
                    if text.startswith("@"):
                        continue  # replies are not counted as tweets
                    # Normalise before the membership test: the dict is keyed
                    # by int, so comparing the raw id could never match a
                    # string id.
                    tweet_id = int(ids[i])
                    if tweet_id in tweets:
                        continue
                    tweets[tweet_id] = {
                        "tweet": text,
                        "date": tweet_info["date"][i],
                        "nlikes": tweet_info["nlikes"][i],
                        "nreplies": tweet_info["nreplies"][i],
                        "nretweets": tweet_info["nretweets"][i],
                        "topic": "",
                    }
                    if len(tweets) == num_tweets:
                        break
            except Exception as err:
                # Best effort: a malformed or missing frame must not abort the
                # scrape. Unlike a bare except this lets KeyboardInterrupt and
                # SystemExit through, and the failure is reported.
                print(f"[scraper] could not read twint results: {err!r}", file=sys.stderr)

            print(len(tweets), " of them are Tweets")

            if len(tweets) >= num_tweets:
                break
            # NOTE(review): the original raised the limit by 100*3**j twice in
            # a row; the net growth is kept. Confirm the doubling is intended.
            fetch_limit += 2 * 100 * 3 ** j

            if deadline < time.time():
                break
            if output.startswith("[!] No more data!"):
                break
        return tweets

    @staticmethod
    def string_search_user_tweets(user_name, search_str, from_date="2006-07-01", to_date=None, num_tweets=10):
        """Search one user's tweets for ``search_str`` with a single twint run.

        Args:
            user_name: Value assigned to twint's ``Username`` option.
            search_str: Value assigned to twint's ``Search`` option.
            from_date: Value assigned to twint's ``Since`` option.
            to_date: Value assigned to twint's ``Until`` option. ``None``
                (the default) means today's date, resolved on every call.
            num_tweets: Value assigned to twint's ``Limit`` option.

        Returns:
            ``twint.output.panda.Tweets_df`` as it stands after the search.
        """
        if to_date is None:
            to_date = str(date.today())

        c = twint.Config()
        c.Username = user_name
        c.Search = search_str  # topic
        c.Pandas = True
        c.Count = True
        c.Limit = num_tweets
        c.Since = from_date
        c.Until = to_date
        c.Hide_output = True
        twint.run.Search(c)
        return twint.output.panda.Tweets_df
|
91 |
+
|
92 |
+
|
93 |
+
|
94 |
+
|