Demea9000 committed on
Commit
b24e23b
1 Parent(s): 76b4944

added TwitterScraper file, changed requirements.txt

Browse files
requirements.txt CHANGED
@@ -1,23 +1,51 @@
 
 
 
 
 
 
 
 
1
  certifi==2022.6.15
 
2
  charset-normalizer==2.1.0
3
  cycler==0.11.0
 
 
 
4
  et-xmlfile==1.1.0
 
5
  fonttools==4.34.0
 
 
 
 
6
  idna==3.3
7
  kiwisolver==1.4.3
8
  matplotlib==3.5.2
 
9
  numpy==1.23.0
 
10
  openai==0.20.0
11
  openpyxl==3.0.10
12
  packaging==21.3
13
  pandas==1.4.3
14
  pandas-stubs==1.4.3.220704
15
  Pillow==9.2.0
 
 
16
  pyparsing==3.0.9
 
17
  python-dateutil==2.8.2
 
18
  pytz==2022.1
19
  regex==2022.6.2
20
  requests==2.28.1
 
 
21
  six==1.16.0
 
22
  tqdm==4.64.0
 
23
  urllib3==1.26.9
 
 
1
+ aiodns==3.0.0
2
+ aiohttp==3.8.1
3
+ aiohttp-socks==0.7.1
4
+ aiosignal==1.2.0
5
+ async-timeout==4.0.2
6
+ attrs==21.4.0
7
+ beautifulsoup4==4.11.1
8
+ cchardet==2.1.7
9
  certifi==2022.6.15
10
+ cffi==1.15.1
11
  charset-normalizer==2.1.0
12
  cycler==0.11.0
13
+ dataclasses==0.6
14
+ elastic-transport==8.1.2
15
+ elasticsearch==8.3.1
16
  et-xmlfile==1.1.0
17
+ fake-useragent==0.1.11
18
  fonttools==4.34.0
19
+ frozenlist==1.3.0
20
+ geographiclib==1.52
21
+ geopy==2.2.0
22
+ googletransx==2.4.2
23
  idna==3.3
24
  kiwisolver==1.4.3
25
  matplotlib==3.5.2
26
+ multidict==6.0.2
27
  numpy==1.23.0
28
+ oauthlib==3.2.0
29
  openai==0.20.0
30
  openpyxl==3.0.10
31
  packaging==21.3
32
  pandas==1.4.3
33
  pandas-stubs==1.4.3.220704
34
  Pillow==9.2.0
35
+ pycares==4.2.1
36
+ pycparser==2.21
37
  pyparsing==3.0.9
38
+ PySocks==1.7.1
39
  python-dateutil==2.8.2
40
+ python-socks==2.0.3
41
  pytz==2022.1
42
  regex==2022.6.2
43
  requests==2.28.1
44
+ requests-oauthlib==1.3.1
45
+ schedule==1.1.0
46
  six==1.16.0
47
+ soupsieve==2.3.2.post1
48
  tqdm==4.64.0
49
+ -e git+https://github.com/twintproject/twint.git@e7c8a0c764f6879188e5c21e25fb6f1f856a7221#egg=twint
50
  urllib3==1.26.9
51
+ yarl==1.7.2
twitter-scraper/TwitterScraper.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Scrape tweets about one topic with twint, save them to CSV, and preview the file."""
import datetime

import pandas as pd
import twint

# Search phrase sent to Twitter. twint documents Config.Search as a plain
# string (the sibling scrape.py passes one too), so it is not wrapped in a list.
TOPIC = "Taylor Swift"
# Upper bound on the number of tweets twint is asked to fetch.
TWEET_LIMIT = 500
# Single source of truth for the CSV path: written by twint, read back by pandas.
OUTPUT_CSV = "taylor_swift_tweets.csv"

c = twint.Config()
c.Search = TOPIC  # topic
c.Limit = TWEET_LIMIT  # number of Tweets to scrape
c.Store_csv = True  # store tweets in a csv file
c.Output = OUTPUT_CSV  # path to csv file

twint.run.Search(c)

# Load the file twint just wrote and show the first rows as a sanity check.
df = pd.read_csv(OUTPUT_CSV)

print(df.head())
twitter-scraper/twint-master/scrape.py CHANGED
@@ -5,47 +5,52 @@ import asyncio
5
  import os
6
  from tkinter import EXCEPTION
7
  from numpy import not_equal
 
8
  loop = asyncio.get_event_loop()
9
  loop.is_running()
10
  import twint
11
  import nest_asyncio
 
12
  nest_asyncio.apply()
13
  from datetime import date
 
 
14
  class scraper:
15
- def get_tweets(search_str, from_date="2006-07-01", to_date=str(date.today()), num_tweets=10,u_or_s='s', acceptable_range=10):
 
16
 
17
- if (type(from_date) or type("str")) is not type("str"):
18
- print("[!] Please make sure the date is a string in this format \"yyyy-mm-dd\" ")
19
  raise EXCEPTION("Incorrect date type Exception!")
20
-
21
- time_out= time.time()+2*60
22
- _dict={}
23
- c=twint.Config()
24
- if u_or_s.lower() =="u":
25
- c.Search = "from:@"+search_str # topic
26
- else:
27
- c.Search = search_str # topic
28
  c.Pandas = True
29
- num_tweets_and_replies=num_tweets
30
- c.Count=True
31
- for j in range(1,5):
32
  c.Limit = num_tweets_and_replies
33
  c.Since = from_date
34
- c.Until = to_date
35
- c.Hide_output =True
36
  old_stdout = sys.stdout
37
  new_stdout = io.StringIO()
38
  sys.stdout = new_stdout
39
- twint.run.Search(c)
40
  output = new_stdout.getvalue()
41
  sys.stdout = old_stdout
42
  print(output[0:-2])
43
- tweet_info=twint.output.panda.Tweets_df
44
-
45
- t_count=0
46
  try:
47
- _keys=tweet_info["id"]
48
- #tweet infor is a dataframe with fallowing columns
49
  '''Index(['id', 'conversation_id', 'created_at', 'date', 'timezone', 'place',
50
  'tweet', 'language', 'hashtags', 'cashtags', 'user_id', 'user_id_str',
51
  'username', 'name', 'day', 'hour', 'link', 'urls', 'photos', 'video',
@@ -55,48 +60,43 @@ class scraper:
55
  'trans_dest'],
56
  dtype='object')'''
57
 
58
- for i in range (len(_keys)):
59
  if _keys[i] in _dict.keys() or tweet_info["tweet"][i].startswith("@"):
60
  pass
61
  else:
62
  _dict[int(_keys[i])] = {"tweet": tweet_info["tweet"][i],
63
- "date" :tweet_info["date"][i],
64
  "nlikes": tweet_info["nlikes"][i],
65
- "nreplies":tweet_info["nreplies"][i] ,
66
- "nretweets": tweet_info["nretweets"][i],"topic":""}
67
- if len(list(_dict.keys()))==num_tweets:
68
  break
69
  except:
70
  pass
71
  print(len(list(_dict.keys())), " of them are Tweets")
72
- if (num_tweets-len(list(_dict.keys())))< acceptable_range:
73
  return _dict
74
  if len(list(_dict.keys())) < num_tweets:
75
- num_tweets_and_replies= num_tweets_and_replies+100*3**j
76
  else:
77
  break
78
- if time_out <time.time():
79
  break
80
  if output.startswith("[!] No more data!"):
81
- break
82
  return _dict
83
 
84
- def string_search_user_tweets(user_name,search_str ,from_date="2006-07-01", to_date=str(date.today()), num_tweets=10):
85
- c=twint.Config()
86
- c.Username =user_name
87
- c.Search = search_str # topic
 
88
  c.Pandas = True
89
- num_tweets_and_replies=num_tweets
90
- c.Count=True
91
  c.Limit = num_tweets_and_replies
92
  c.Since = from_date
93
- c.Until = to_date
94
- c.Hide_output =True
95
  twint.run.Search(c)
96
  return twint.output.panda.Tweets_df
97
-
98
-
99
-
100
-
101
-
102
-
 
5
  import os
6
  from tkinter import EXCEPTION
7
  from numpy import not_equal
8
+
9
  loop = asyncio.get_event_loop()
10
  loop.is_running()
11
  import twint
12
  import nest_asyncio
13
+
14
  nest_asyncio.apply()
15
  from datetime import date
16
+
17
+
18
  class scraper:
19
+ def get_tweets(search_str, from_date="2006-07-01", to_date=str(date.today()), num_tweets=10, u_or_s='s',
20
+ acceptable_range=10):
21
 
22
+ if (type(from_date) or type("str")) is not type("str"):
23
+ print("[!] Please make sure the date is a string in this format \"yyyy-mm-dd\" ")
24
  raise EXCEPTION("Incorrect date type Exception!")
25
+
26
+ time_out = time.time() + 2 * 60
27
+ _dict = {}
28
+ c = twint.Config()
29
+ if u_or_s.lower() == "u":
30
+ c.Search = "from:@" + search_str # topic
31
+ else:
32
+ c.Search = search_str # topic
33
  c.Pandas = True
34
+ num_tweets_and_replies = num_tweets
35
+ c.Count = True
36
+ for j in range(1, 5):
37
  c.Limit = num_tweets_and_replies
38
  c.Since = from_date
39
+ c.Until = to_date
40
+ c.Hide_output = True
41
  old_stdout = sys.stdout
42
  new_stdout = io.StringIO()
43
  sys.stdout = new_stdout
44
+ twint.run.Search(c)
45
  output = new_stdout.getvalue()
46
  sys.stdout = old_stdout
47
  print(output[0:-2])
48
+ tweet_info = twint.output.panda.Tweets_df
49
+
50
+ t_count = 0
51
  try:
52
+ _keys = tweet_info["id"]
53
+ # tweet_info is a DataFrame with the following columns
54
  '''Index(['id', 'conversation_id', 'created_at', 'date', 'timezone', 'place',
55
  'tweet', 'language', 'hashtags', 'cashtags', 'user_id', 'user_id_str',
56
  'username', 'name', 'day', 'hour', 'link', 'urls', 'photos', 'video',
 
60
  'trans_dest'],
61
  dtype='object')'''
62
 
63
+ for i in range(len(_keys)):
64
  if _keys[i] in _dict.keys() or tweet_info["tweet"][i].startswith("@"):
65
  pass
66
  else:
67
  _dict[int(_keys[i])] = {"tweet": tweet_info["tweet"][i],
68
+ "date": tweet_info["date"][i],
69
  "nlikes": tweet_info["nlikes"][i],
70
+ "nreplies": tweet_info["nreplies"][i],
71
+ "nretweets": tweet_info["nretweets"][i], "topic": ""}
72
+ if len(list(_dict.keys())) == num_tweets:
73
  break
74
  except:
75
  pass
76
  print(len(list(_dict.keys())), " of them are Tweets")
77
+ if (num_tweets - len(list(_dict.keys()))) < acceptable_range:
78
  return _dict
79
  if len(list(_dict.keys())) < num_tweets:
80
+ num_tweets_and_replies = num_tweets_and_replies + 100 * 3 ** j
81
  else:
82
  break
83
+ if time_out < time.time():
84
  break
85
  if output.startswith("[!] No more data!"):
86
+ break
87
  return _dict
88
 
def string_search_user_tweets(user_name, search_str, from_date="2006-07-01", to_date=None,
                              num_tweets=10):
    """Search a single user's tweets for a phrase and return twint's pandas result.

    The function is declared without ``self``; call it on the class, e.g.
    ``scraper.string_search_user_tweets("someuser", "some phrase")``.

    Args:
        user_name: Value assigned to ``Config.Username`` (the account to search).
        search_str: Value assigned to ``Config.Search`` (the topic/phrase).
        from_date: Lower date bound as a "yyyy-mm-dd" string (``Config.Since``).
        to_date: Upper date bound as a "yyyy-mm-dd" string (``Config.Until``).
            ``None`` (the default) means today's date, resolved on every call
            rather than once when the module is imported.
        num_tweets: Value assigned to ``Config.Limit``.

    Returns:
        ``twint.output.panda.Tweets_df`` as populated by the search.
    """
    if to_date is None:
        # Resolve "today" at call time; a default of str(date.today()) in the
        # signature would be frozen at definition time and go stale.
        to_date = str(date.today())

    c = twint.Config()
    c.Username = user_name
    c.Search = search_str  # topic
    c.Pandas = True  # keep results in twint.output.panda
    c.Count = True
    c.Limit = num_tweets
    c.Since = from_date
    c.Until = to_date
    c.Hide_output = True
    twint.run.Search(c)
    return twint.output.panda.Tweets_df