added TwitterScraper file, changed requirements.txt
Files changed:
- requirements.txt +28 -0
- twitter-scraper/TwitterScraper.py +17 -0
- twitter-scraper/twint-master/scrape.py +45 -45
requirements.txt
CHANGED
@@ -1,23 +1,51 @@
+aiodns==3.0.0
+aiohttp==3.8.1
+aiohttp-socks==0.7.1
+aiosignal==1.2.0
+async-timeout==4.0.2
+attrs==21.4.0
+beautifulsoup4==4.11.1
+cchardet==2.1.7
 certifi==2022.6.15
+cffi==1.15.1
 charset-normalizer==2.1.0
 cycler==0.11.0
+dataclasses==0.6
+elastic-transport==8.1.2
+elasticsearch==8.3.1
 et-xmlfile==1.1.0
+fake-useragent==0.1.11
 fonttools==4.34.0
+frozenlist==1.3.0
+geographiclib==1.52
+geopy==2.2.0
+googletransx==2.4.2
 idna==3.3
 kiwisolver==1.4.3
 matplotlib==3.5.2
+multidict==6.0.2
 numpy==1.23.0
+oauthlib==3.2.0
 openai==0.20.0
 openpyxl==3.0.10
 packaging==21.3
 pandas==1.4.3
 pandas-stubs==1.4.3.220704
 Pillow==9.2.0
+pycares==4.2.1
+pycparser==2.21
 pyparsing==3.0.9
+PySocks==1.7.1
 python-dateutil==2.8.2
+python-socks==2.0.3
 pytz==2022.1
 regex==2022.6.2
 requests==2.28.1
+requests-oauthlib==1.3.1
+schedule==1.1.0
 six==1.16.0
+soupsieve==2.3.2.post1
 tqdm==4.64.0
+-e git+https://github.com/twintproject/twint.git@e7c8a0c764f6879188e5c21e25fb6f1f856a7221#egg=twint
 urllib3==1.26.9
+yarl==1.7.2
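Most of these new pins (aiohttp, aiodns, fake-useragent, geopy, googletransx, schedule, yarl, and the rest) are twint's own dependencies; twint itself is installed in editable mode from a pinned GitHub commit rather than from PyPI. One thing to flag: scrape.py below also imports nest_asyncio, which does not appear in this list. A quick post-install sanity check might look like the sketch below (not part of the commit):

import twint

# Confirm the editable install resolved to the pinned GitHub checkout
# rather than a stale PyPI copy; prints the on-disk package location.
print(twint.__file__)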
twitter-scraper/TwitterScraper.py
ADDED
@@ -0,0 +1,17 @@
+import twint
+import datetime
+
+c = twint.Config()
+
+c.Search = ['Taylor Swift']  # topic
+c.Limit = 500  # number of Tweets to scrape
+c.Store_csv = True  # store tweets in a csv file
+c.Output = "taylor_swift_tweets.csv"  # path to csv file
+
+twint.run.Search(c)
+
+import pandas as pd
+
+df = pd.read_csv('taylor_swift_tweets.csv')
+
+print(df.head())
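The new script is a straight-line twint run followed by an unconditional read-back of the CSV. A more defensive variant is sketched below (the guard is an assumption, not part of the commit): twint creates the output file lazily, so a search that returns nothing leaves no CSV and pd.read_csv raises FileNotFoundError.

import os
import pandas as pd

csv_path = "taylor_swift_tweets.csv"  # same output path the script writes

# Guard the read-back: twint only creates the CSV once it has results
# (hypothetical hardening, not in the committed script).
if os.path.exists(csv_path):
    df = pd.read_csv(csv_path)
    print(df.head())
else:
    print("[!] Nothing scraped; no CSV was written.")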
twitter-scraper/twint-master/scrape.py
CHANGED
@@ -5,47 +5,52 @@ import asyncio
 import os
 from tkinter import EXCEPTION
 from numpy import not_equal
+
 loop = asyncio.get_event_loop()
 loop.is_running()
 import twint
 import nest_asyncio
+
 nest_asyncio.apply()
 from datetime import date
+
+
 class scraper:
-    def get_tweets(search_str, from_date="2006-07-01", to_date=str(date.today()), num_tweets=10,u_or_s='s',
+    def get_tweets(search_str, from_date="2006-07-01", to_date=str(date.today()), num_tweets=10, u_or_s='s',
+                   acceptable_range=10):
 
-        if (type(from_date) or
-            print("[!] Please make sure the date is a string in this format \"yyyy-mm-dd\" ")
+        if (type(from_date) or type("str")) is not type("str"):
+            print("[!] Please make sure the date is a string in this format \"yyyy-mm-dd\" ")
             raise EXCEPTION("Incorrect date type Exception!")
-
-        time_out= time.time()+2*60
-        _dict={}
-        c=twint.Config()
-        if u_or_s.lower() =="u":
-            c.Search = "from:@"+search_str
-        else:
-            c.Search = search_str
+
+        time_out = time.time() + 2 * 60
+        _dict = {}
+        c = twint.Config()
+        if u_or_s.lower() == "u":
+            c.Search = "from:@" + search_str  # topic
+        else:
+            c.Search = search_str  # topic
         c.Pandas = True
-        num_tweets_and_replies=num_tweets
-        c.Count=True
-        for j in range(1,5):
+        num_tweets_and_replies = num_tweets
+        c.Count = True
+        for j in range(1, 5):
             c.Limit = num_tweets_and_replies
             c.Since = from_date
-            c.Until =
-            c.Hide_output =True
+            c.Until = to_date
+            c.Hide_output = True
             old_stdout = sys.stdout
             new_stdout = io.StringIO()
             sys.stdout = new_stdout
-            twint.run.Search(c)
+            twint.run.Search(c)
             output = new_stdout.getvalue()
             sys.stdout = old_stdout
             print(output[0:-2])
-            tweet_info=twint.output.panda.Tweets_df
-
-            t_count=0
+            tweet_info = twint.output.panda.Tweets_df
+
+            t_count = 0
             try:
-                _keys=tweet_info["id"]
-                #tweet infor is a dataframe with fallowing columns
+                _keys = tweet_info["id"]
+                # tweet infor is a dataframe with fallowing columns
                 '''Index(['id', 'conversation_id', 'created_at', 'date', 'timezone', 'place',
                 'tweet', 'language', 'hashtags', 'cashtags', 'user_id', 'user_id_str',
                 'username', 'name', 'day', 'hour', 'link', 'urls', 'photos', 'video',
@@ -55,48 +60,43 @@
                 'trans_dest'],
                 dtype='object')'''
 
-                for i in range
+                for i in range(len(_keys)):
                     if _keys[i] in _dict.keys() or tweet_info["tweet"][i].startswith("@"):
                         pass
                     else:
                         _dict[int(_keys[i])] = {"tweet": tweet_info["tweet"][i],
-                                                "date"
+                                                "date": tweet_info["date"][i],
                                                 "nlikes": tweet_info["nlikes"][i],
-                                                "nreplies":tweet_info["nreplies"][i]
-                                                "nretweets": tweet_info["nretweets"][i],"topic":""}
-                    if len(list(_dict.keys()))==num_tweets:
+                                                "nreplies": tweet_info["nreplies"][i],
+                                                "nretweets": tweet_info["nretweets"][i], "topic": ""}
+                    if len(list(_dict.keys())) == num_tweets:
                         break
             except:
                 pass
             print(len(list(_dict.keys())), " of them are Tweets")
-            if (num_tweets-len(list(_dict.keys())))< acceptable_range:
+            if (num_tweets - len(list(_dict.keys()))) < acceptable_range:
                 return _dict
             if len(list(_dict.keys())) < num_tweets:
-                num_tweets_and_replies= num_tweets_and_replies+100*3**j
+                num_tweets_and_replies = num_tweets_and_replies + 100 * 3 ** j
             else:
                 break
-            if time_out <time.time():
+            if time_out < time.time():
                 break
             if output.startswith("[!] No more data!"):
-            break
+                break
         return _dict
 
-    def string_search_user_tweets(user_name,search_str
-
-        c
-        c.
+    def string_search_user_tweets(user_name, search_str, from_date="2006-07-01", to_date=str(date.today()),
+                                  num_tweets=10):
+        c = twint.Config()
+        c.Username = user_name
+        c.Search = search_str  # topic
         c.Pandas = True
-        num_tweets_and_replies=num_tweets
-        c.Count=True
+        num_tweets_and_replies = num_tweets
+        c.Count = True
         c.Limit = num_tweets_and_replies
         c.Since = from_date
-        c.Until =
-        c.Hide_output =True
+        c.Until = to_date
+        c.Hide_output = True
         twint.run.Search(c)
         return twint.output.panda.Tweets_df
-
-
-
-
-
-
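For context, a sketch of how the revised scraper class might be driven (assuming scrape.py's first four lines, outside this hunk, import time, sys, and io; the module name in the import is inferred from the file path, and note that get_tweets is defined without self, so it is called on the class rather than an instance):

from scrape import scraper

# u_or_s='s' searches by topic; 'u' prefixes the query with from:@<user>.
tweets = scraper.get_tweets("Taylor Swift", from_date="2022-01-01",
                            num_tweets=50, u_or_s='s')

# get_tweets returns {tweet_id: {"tweet", "date", "nlikes", "nreplies",
# "nretweets", "topic"}}, skipping replies that start with "@".
for tweet_id, info in tweets.items():
    print(tweet_id, info["nlikes"], info["tweet"][:80])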