Mosa committed on
Commit
d94f83a
1 Parent(s): 35c6ca6

Fixed_the_comments

Browse files
twitter-scraper/scrape.py CHANGED
@@ -2,102 +2,76 @@
2
  from tkinter import EXCEPTION
3
  import twint
4
  from datetime import date
5
- import pandas as pd
6
- import sys
7
- import io
8
- import time
 
 
 
9
  class scraper:
10
- def get_tweets(search_str, from_date="2006-07-01", to_date=str(date.today()), num_tweets=10, u_or_s='s',
11
- acceptable_range=10):
12
- if (type(from_date) or type("str")) is not type("str"):
13
- print("[!] Please make sure the date is a string in this format \"yyyy-mm-dd\" ")
14
- raise EXCEPTION("Incorrect date type Exception!")
 
15
 
16
- time_out = time.time() + 2 * 60
17
- _dict = {}
18
- c = twint.Config()
19
- if u_or_s.lower() == "u":
20
- c.Search = "from:@" + search_str # topic
21
- else:
22
- c.Search = search_str # topic
23
- c.Pandas = True
24
- num_tweets_and_replies = num_tweets
25
- c.Count = True
26
- #for j in range(1, 5):
27
- c.Limit = num_tweets_and_replies
28
- c.Since = from_date
29
- c.Until = to_date
30
- c.Hide_output = True
31
- old_stdout = sys.stdout
32
- new_stdout = io.StringIO()
33
- sys.stdout = new_stdout
34
- twint.run.Search(c)
35
- output = new_stdout.getvalue()
36
- sys.stdout = old_stdout
37
- print(output[0:-2])
38
- tweet_info =twint.output.panda.Tweets_df
39
  indx_replies=[]
40
- tweet=tweet_info['tweet']
41
  for i in range(len(tweet)):
42
  if tweet[i].startswith("@"):
43
  indx_replies.append(i)
44
- tweet_info.drop(indx_replies,axis=0, inplace =True)
45
- print(len(tweet_info['tweet']), " of them are Tweets")
46
- #df.drop([5,6], axis=0, inplace=True)
47
- return tweet_info
48
-
49
 
 
 
 
 
50
 
51
-
52
- # try:
53
- # _keys = tweet_info[["id","tweet","date","user_id","urls" ,'nlikes', 'nreplies', 'nretweets']]
54
- # # tweet infor is a dataframe with fallowing columns
55
- # # Index(['id', 'conversation_id', 'created_at', 'date', 'timezone', 'place',
56
- # # 'tweet', 'language', 'hashtags', 'cashtags', 'user_id', 'user_id_str',
57
- # # 'username', 'name', 'day', 'hour', 'link', 'urls', 'photos', 'video',
58
- # # 'thumbnail', 'retweet', 'nlikes', 'nreplies', 'nretweets', 'quote_url',
59
- # # 'search', 'near', 'geo', 'source', 'user_rt_id', 'user_rt',
60
- # # 'retweet_id', 'reply_to', 'retweet_date', 'translate', 'trans_src',
61
- # # 'trans_dest'],
62
- # # dtype='object')
63
-
64
- # for i in range(len( _keys)):
65
- # if _keys[i] in _dict.keys() or tweet_info["tweet"][i].startswith("@"):
66
- # pass
67
- # else:
68
- # _dict[int(_keys[i])] = {"tweet": tweet_info["tweet"][i],
69
- # "date": tweet_info["date"][i],
70
- # "nlikes": tweet_info["nlikes"][i],
71
- # "nreplies": tweet_info["nreplies"][i],
72
- # "nretweets": tweet_info["nretweets"][i], "topic": ""}
73
- # if len(list(_dict.keys())) == num_tweets:
74
- # break
75
- # except:
76
- # pass
77
- # print(len(list(_dict.keys())), " of them are Tweets")
78
- # if (num_tweets - len(list(_dict.keys()))) < acceptable_range:
79
- # return _dict
80
- # if len(list(_dict.keys())) < num_tweets:
81
- # num_tweets_and_replies = num_tweets_and_replies + 100 * 3 ** j
82
- # else:
83
- # break
84
- # if time_out < time.time():
85
- # break
86
- # if output.startswith("[!] No more data!"):
87
- # break
88
- #return _dict
89
-
90
- def string_search_user_tweets(user_name, search_str, from_date="2006-07-01", to_date=str(date.today()),
91
- num_tweets=10):
92
- c = twint.Config()
93
- c.Username = user_name
94
- c.Search = search_str # topic
95
- c.Pandas = True
96
- num_tweets_and_replies = num_tweets
97
- c.Count = True
98
- c.Limit = num_tweets_and_replies
99
- c.Since = from_date
100
- c.Until = to_date
101
- c.Hide_output = True
102
- twint.run.Search(c)
103
- return twint.output.panda.Tweets_df
 
2
  from tkinter import EXCEPTION
3
  import twint
4
  from datetime import date
5
class scraper:
    """Twitter scraper built on top of twint.

    An instance holds a date window (``from_date`` .. ``to_date``), a tweet
    limit (``num_tweets``) and one reusable ``twint.Config`` object.  Each
    ``scrape_by_*`` method runs a twint search and returns a pandas DataFrame
    (not a dictionary) restricted to the columns
    ``id, tweet, date, user_id, urls, nlikes, nreplies, nretweets``, with
    every row whose text starts with "@" removed.
    """

    # NOTE: the ``to_date`` default is evaluated once, when the class is
    # defined, so a long-running process keeps the date of its start-up.
    def __init__(self, from_date="2006-07-01", to_date=str(date.today()), num_tweets=20):
        """Store the search window and limit and create the twint config.

        Args:
            from_date: Start of the window, a string such as "yyyy-mm-dd".
            to_date: End of the window, a string such as "yyyy-mm-dd".
            num_tweets: Value assigned to twint's ``Limit`` option.

        Raises:
            TypeError: If either date is not a string.
            ValueError: If either date contains no "-" separator.
        """
        # Fail early with a clear message instead of deep inside twint.
        self.__check_date_type(from_date, to_date)
        self.from_date = from_date
        self.to_date = to_date
        self.num_tweets = num_tweets
        self.conf = twint.Config()

    def scrape_by_user(self, _user: str):
        """Return the tweets found by searching for ``from:@<_user>``."""
        # The config object is shared between calls: clear a user filter that
        # an earlier scrape_by_user_and_string() may have left behind.
        self.conf.Username = None
        self.conf.Search = "from:@" + _user
        return self.__get_tweets_from_twint__()

    def scrape_by_string(self, _string: str):
        """Return the tweets found by searching for ``_string``."""
        # Same reason as in scrape_by_user: do not inherit a stale Username.
        self.conf.Username = None
        self.conf.Search = _string
        return self.__get_tweets_from_twint__()

    def scrape_by_user_and_string(self, _user: str, _string: str):
        """Return the tweets of ``_user`` that match the search ``_string``."""
        self.conf.Username = _user
        self.conf.Search = _string
        return self.__get_tweets_from_twint__()

    @staticmethod
    def __get_only_tweets(tweet_and_replies):
        """Return a copy of the frame without rows whose tweet starts with "@".

        A leading "@" is used here as the marker of a reply.  The input frame
        is not modified and the surviving rows keep their index labels.

        Args:
            tweet_and_replies: DataFrame with a "tweet" column.
        """
        # A boolean mask works for any index, unlike counting positions and
        # then looking rows up by label.  Non-string entries are kept.
        is_reply = tweet_and_replies["tweet"].str.startswith("@", na=False)
        return tweet_and_replies[~is_reply]

    def __get_tweets_from_twint__(self):
        """Run the configured twint search and return the filtered tweets.

        twint stores its result in ``twint.output.panda.Tweets_df``.  The
        columns listed by the original author for that frame are:
        'id', 'conversation_id', 'created_at', 'date', 'timezone', 'place',
        'tweet', 'language', 'hashtags', 'cashtags', 'user_id', 'user_id_str',
        'username', 'name', 'day', 'hour', 'link', 'urls', 'photos', 'video',
        'thumbnail', 'retweet', 'nlikes', 'nreplies', 'nretweets', 'quote_url',
        'search', 'near', 'geo', 'source', 'user_rt_id', 'user_rt',
        'retweet_id', 'reply_to', 'retweet_date', 'translate', 'trans_src',
        'trans_dest'.  Only the relevant ones are kept.

        Returns:
            DataFrame with the selected columns and without "@" rows.
        """
        self.conf.Pandas = True
        self.conf.Count = True
        self.conf.Limit = self.num_tweets
        self.conf.Since = self.from_date
        self.conf.Until = self.to_date
        self.conf.Hide_output = True
        twint.run.Search(self.conf)
        # NOTE(review): Tweets_df is module-level state inside twint; what it
        # holds when a search finds nothing is not visible here - confirm.
        tweet_info = twint.output.panda.Tweets_df
        tweet_info = tweet_info[["id", "tweet", "date", "user_id", "urls",
                                 "nlikes", "nreplies", "nretweets"]]
        return self.__get_only_tweets(tweet_info)

    @staticmethod
    def __check_date_type(d1, d2):
        """Validate that both dates are strings containing a "-" separator.

        Each value is checked on its own.  Real exception classes are raised;
        the ``EXCEPTION`` name imported from tkinter is a Tk constant, not an
        exception class, so it cannot be raised.

        Raises:
            TypeError: If ``d1`` or ``d2`` is not a string.
            ValueError: If ``d1`` or ``d2`` has no "-" separated parts.
        """
        message = ("Incorrect date type Exception! Please make sure the date "
                   "is a string in this format \"yyyy-mm-dd\"")
        for value in (d1, d2):
            if not isinstance(value, str):
                raise TypeError(message)
            if len(value.split("-")) < 2:
                raise ValueError(message)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
twitter-scraper/twitter_scraper.ipynb CHANGED
@@ -10,7 +10,7 @@
10
  },
11
  {
12
  "cell_type": "code",
13
- "execution_count": null,
14
  "id": "c9021300",
15
  "metadata": {
16
  "scrolled": true
@@ -31,209 +31,456 @@
31
  },
32
  {
33
  "cell_type": "code",
34
- "execution_count": null,
35
  "id": "1413ab2b",
36
  "metadata": {},
37
  "outputs": [],
38
  "source": [
39
- "# import asyncio\n",
40
- "# import os\n",
41
- "# loop = asyncio.get_event_loop()\n",
42
- "# loop.is_running()\n",
43
- "# import twint\n",
44
- "# import nest_asyncio\n",
45
- "# nest_asyncio.apply()"
46
  ]
47
  },
48
  {
49
  "cell_type": "code",
50
- "execution_count": null,
51
  "id": "d38514f3",
52
  "metadata": {},
53
  "outputs": [],
54
  "source": [
55
- "import scrape\n"
56
- ]
57
- },
58
- {
59
- "cell_type": "code",
60
- "execution_count": null,
61
- "id": "a7912a91",
62
- "metadata": {},
63
- "outputs": [],
64
- "source": [
65
- "from_date=\"2022-6-10 10:30:22\"\n",
66
- "to_date= \"2022-6-30\"\n",
67
- "num_tweets = 20\n",
68
- "_data=scrape.scraper.get_tweets(\"jimmieakesson\",u_or_s=\"u\",from_date=221232,to_date=2313)\n"
69
- ]
70
- },
71
- {
72
- "cell_type": "code",
73
- "execution_count": null,
74
- "id": "48d50b46",
75
- "metadata": {},
76
- "outputs": [],
77
- "source": [
78
- "tweets= _data.keys()\n",
79
- "for i in tweets:\n",
80
- " _data[i][\"tweet\"]\n",
81
- " print(_data[i][\"tweet\"], \"\\n\", \"__________________________________________________________\")"
82
- ]
83
- },
84
- {
85
- "cell_type": "code",
86
- "execution_count": null,
87
- "id": "72cabcb5",
88
- "metadata": {},
89
- "outputs": [],
90
- "source": [
91
- "from_date=\"2022-6-10 10:30:22\"\n",
92
- "to_date= \"2022-6-30\"\n",
93
- "num_tweets = 20\n",
94
- "_data=scrape.scraper.string_search_user_tweets(\"jimmieakesson\",\"invandring\")\n"
95
- ]
96
- },
97
- {
98
- "cell_type": "code",
99
- "execution_count": null,
100
- "id": "549e4fb3",
101
- "metadata": {},
102
- "outputs": [],
103
- "source": [
104
- "tweets= _data[\"tweet\"]\n",
105
- "for i in tweets:\n",
106
- " print(i, \"\\n\", \"__________________________________________________________\")"
107
  ]
108
  },
109
  {
110
  "cell_type": "code",
111
- "execution_count": 3,
112
- "id": "733dd44a",
113
  "metadata": {},
114
  "outputs": [
115
  {
116
  "name": "stdout",
117
  "output_type": "stream",
118
  "text": [
119
- "Defaulting to user installation because normal site-packages is not writeable\n",
120
- "Requirement already satisfied: snscrape in /home/oxygen/.local/lib/python3.10/site-packages (0.3.4)\n",
121
- "Requirement already satisfied: beautifulsoup4 in /home/oxygen/.local/lib/python3.10/site-packages (from snscrape) (4.11.1)\n",
122
- "Requirement already satisfied: requests[socks] in /usr/lib/python3/dist-packages (from snscrape) (2.25.1)\n",
123
- "Requirement already satisfied: lxml in /usr/lib/python3/dist-packages (from snscrape) (4.8.0)\n",
124
- "Requirement already satisfied: soupsieve>1.2 in /home/oxygen/.local/lib/python3.10/site-packages (from beautifulsoup4->snscrape) (2.3.2.post1)\n",
125
- "Requirement already satisfied: PySocks!=1.5.7,>=1.5.6 in /home/oxygen/.local/lib/python3.10/site-packages (from requests[socks]->snscrape) (1.7.1)\n"
126
  ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
127
  }
128
  ],
129
  "source": [
130
- "#%pip install -q snscrape==0.3.4\n",
131
- "!pip3 install snscrape\n",
132
- "#!pip3 install git+https://github.com/JustAnotherArchivist/snscrape.git"
133
  ]
134
  },
135
  {
136
  "cell_type": "code",
137
- "execution_count": 14,
138
- "id": "0d16422c",
139
  "metadata": {},
140
  "outputs": [
141
  {
142
  "name": "stdout",
143
  "output_type": "stream",
144
  "text": [
145
- "Note: you may need to restart the kernel to use updated packages.\n"
146
  ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
147
  }
148
  ],
149
  "source": [
150
- "%pip install -q snscrape==0.3.4\n",
151
- "from datetime import date\n",
152
- "import os\n",
153
- "import pandas as pd\n",
154
- "\n",
155
- "\n",
156
- "def get_tweets(search_term, from_date, to_date=date.today(), num_tweets=100,u_or_s='s'):\n",
157
- " if u_or_s.lower() =='u':\n",
158
- " extracted_tweets = \"snscrape --format '{content!r}'\"+ f\" --max-results {num_tweets} --since {from_date} twitter-user '{search_term} until:{to_date}' > extracted-tweets.txt\" \n",
159
- " else:\n",
160
- " extracted_tweets = \"snscrape --format '{content!r}'\"+ f\" --max-results {num_tweets} --since {from_date} twitter-search '{search_term} until:{to_date}' > extracted-tweets.txt\"\n",
161
- " \n",
162
- " os.system(extracted_tweets)\n",
163
- " if os.stat(\"extracted-tweets.txt\").st_size == 0:\n",
164
- " print('No Tweets found')\n",
165
- " else:\n",
166
- " df = pd.read_csv('extracted-tweets.txt', names=['content'])\n",
167
- " data_list=[]\n",
168
- " for row in df['content'].iteritems():\n",
169
- " temp= str(row[0])+str(row[1])\n",
170
- " temp= temp.replace(\"\\'\",\"\")\n",
171
- " data_list.append(temp)\n",
172
- " return data_list\n",
173
- "\n"
174
  ]
175
  },
176
  {
177
  "cell_type": "code",
178
  "execution_count": 12,
179
- "id": "8e2adb35",
180
  "metadata": {},
181
  "outputs": [
182
  {
183
  "name": "stdout",
184
  "output_type": "stream",
185
  "text": [
186
- "No Tweets found\n"
187
- ]
188
- },
189
- {
190
- "name": "stderr",
191
- "output_type": "stream",
192
- "text": [
193
- "Traceback (most recent call last):\n",
194
- " File \"/home/oxygen/.local/bin/snscrape\", line 8, in <module>\n",
195
- " sys.exit(main())\n",
196
- " File \"/home/oxygen/.local/lib/python3.10/site-packages/snscrape/cli.py\", line 224, in main\n",
197
- " args = parse_args()\n",
198
- " File \"/home/oxygen/.local/lib/python3.10/site-packages/snscrape/cli.py\", line 159, in parse_args\n",
199
- " import snscrape.modules\n",
200
- " File \"/home/oxygen/.local/lib/python3.10/site-packages/snscrape/modules/__init__.py\", line 15, in <module>\n",
201
- " _import_modules()\n",
202
- " File \"/home/oxygen/.local/lib/python3.10/site-packages/snscrape/modules/__init__.py\", line 12, in _import_modules\n",
203
- " module = importlib.import_module(moduleName)\n",
204
- " File \"/usr/lib/python3.10/importlib/__init__.py\", line 126, in import_module\n",
205
- " return _bootstrap._gcd_import(name[level:], package, level)\n",
206
- " File \"/home/oxygen/.local/lib/python3.10/site-packages/snscrape/modules/instagram.py\", line 12, in <module>\n",
207
- " class InstagramPost(typing.NamedTuple, snscrape.base.Item):\n",
208
- " File \"/usr/lib/python3.10/typing.py\", line 2329, in _namedtuple_mro_entries\n",
209
- " raise TypeError(\"Multiple inheritance with NamedTuple is not supported\")\n",
210
- "TypeError: Multiple inheritance with NamedTuple is not supported\n"
211
  ]
212
  },
213
  {
214
- "ename": "UnboundLocalError",
215
- "evalue": "local variable 'df' referenced before assignment",
216
- "output_type": "error",
217
- "traceback": [
218
- "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
219
- "\u001b[0;31mUnboundLocalError\u001b[0m Traceback (most recent call last)",
220
- "\u001b[0;32m/tmp/ipykernel_26511/1892081786.py\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0md\u001b[0m\u001b[0;34m=\u001b[0m \u001b[0mget_tweets\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"jimmieakesson\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mfrom_date\u001b[0m\u001b[0;34m=\u001b[0m \u001b[0;34m\"2022-06-01\"\u001b[0m \u001b[0;34m,\u001b[0m\u001b[0mnum_tweets\u001b[0m \u001b[0;34m=\u001b[0m\u001b[0;36m5\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mu_or_s\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"u\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
221
- "\u001b[0;32m/tmp/ipykernel_26511/275462205.py\u001b[0m in \u001b[0;36mget_tweets\u001b[0;34m(search_term, from_date, to_date, num_tweets, u_or_s)\u001b[0m\n\u001b[1;32m 17\u001b[0m \u001b[0mdf\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread_csv\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'extracted-tweets.txt'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnames\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'content'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 18\u001b[0m \u001b[0mdata_list\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 19\u001b[0;31m \u001b[0;32mfor\u001b[0m \u001b[0mrow\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'content'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0miteritems\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 20\u001b[0m \u001b[0mtemp\u001b[0m\u001b[0;34m=\u001b[0m \u001b[0mstr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrow\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m+\u001b[0m\u001b[0mstr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrow\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 21\u001b[0m \u001b[0mtemp\u001b[0m\u001b[0;34m=\u001b[0m \u001b[0mtemp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreplace\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"\\'\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\"\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
222
- "\u001b[0;31mUnboundLocalError\u001b[0m: local variable 'df' referenced before assignment"
223
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
224
  }
225
  ],
226
  "source": [
227
- "d= get_tweets(\"jimmieakesson\",from_date= \"2022-06-01\" ,num_tweets =5, u_or_s=\"u\")"
 
228
  ]
229
  },
230
  {
231
  "cell_type": "code",
232
  "execution_count": null,
233
- "id": "a2c837f4",
234
  "metadata": {},
235
  "outputs": [],
236
- "source": []
 
 
 
 
 
237
  }
238
  ],
239
  "metadata": {
 
10
  },
11
  {
12
  "cell_type": "code",
13
+ "execution_count": 1,
14
  "id": "c9021300",
15
  "metadata": {
16
  "scrolled": true
 
31
  },
32
  {
33
  "cell_type": "code",
34
+ "execution_count": 2,
35
  "id": "1413ab2b",
36
  "metadata": {},
37
  "outputs": [],
38
  "source": [
39
+ "import asyncio\n",
40
+ "import os\n",
41
+ "loop = asyncio.get_event_loop()\n",
42
+ "loop.is_running()\n",
43
+ "import twint\n",
44
+ "import nest_asyncio\n",
45
+ "nest_asyncio.apply()"
46
  ]
47
  },
48
  {
49
  "cell_type": "code",
50
+ "execution_count": 6,
51
  "id": "d38514f3",
52
  "metadata": {},
53
  "outputs": [],
54
  "source": [
55
+ "import scrape\n",
56
+ "sc= scrape.scraper( from_date=\"2006-07-01\", to_date= \"2022-06-22\",num_tweets=100)\n"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57
  ]
58
  },
59
  {
60
  "cell_type": "code",
61
+ "execution_count": 8,
62
+ "id": "d37e5cbf",
63
  "metadata": {},
64
  "outputs": [
65
  {
66
  "name": "stdout",
67
  "output_type": "stream",
68
  "text": [
69
+ "[+] Finished: Successfully collected 100 Tweets.\n"
 
 
 
 
 
 
70
  ]
71
+ },
72
+ {
73
+ "data": {
74
+ "text/html": [
75
+ "<div>\n",
76
+ "<style scoped>\n",
77
+ " .dataframe tbody tr th:only-of-type {\n",
78
+ " vertical-align: middle;\n",
79
+ " }\n",
80
+ "\n",
81
+ " .dataframe tbody tr th {\n",
82
+ " vertical-align: top;\n",
83
+ " }\n",
84
+ "\n",
85
+ " .dataframe thead th {\n",
86
+ " text-align: right;\n",
87
+ " }\n",
88
+ "</style>\n",
89
+ "<table border=\"1\" class=\"dataframe\">\n",
90
+ " <thead>\n",
91
+ " <tr style=\"text-align: right;\">\n",
92
+ " <th></th>\n",
93
+ " <th>id</th>\n",
94
+ " <th>tweet</th>\n",
95
+ " <th>date</th>\n",
96
+ " <th>user_id</th>\n",
97
+ " <th>urls</th>\n",
98
+ " <th>nlikes</th>\n",
99
+ " <th>nreplies</th>\n",
100
+ " <th>nretweets</th>\n",
101
+ " </tr>\n",
102
+ " </thead>\n",
103
+ " <tbody>\n",
104
+ " <tr>\n",
105
+ " <th>0</th>\n",
106
+ " <td>1539394015560359944</td>\n",
107
+ " <td>wAllah comme si on avais pas d’autre choses j’...</td>\n",
108
+ " <td>2022-06-22 01:45:08</td>\n",
109
+ " <td>1202681666487115776</td>\n",
110
+ " <td>[]</td>\n",
111
+ " <td>3</td>\n",
112
+ " <td>0</td>\n",
113
+ " <td>0</td>\n",
114
+ " </tr>\n",
115
+ " <tr>\n",
116
+ " <th>5</th>\n",
117
+ " <td>1539387277960433664</td>\n",
118
+ " <td>Şev baş temaşevanen heja https://t.co/aqw5vNPLFr</td>\n",
119
+ " <td>2022-06-22 01:18:22</td>\n",
120
+ " <td>743954955220979713</td>\n",
121
+ " <td>[https://youtu.be/n_vYzgRBFUI]</td>\n",
122
+ " <td>0</td>\n",
123
+ " <td>0</td>\n",
124
+ " <td>0</td>\n",
125
+ " </tr>\n",
126
+ " <tr>\n",
127
+ " <th>7</th>\n",
128
+ " <td>1539386040313851904</td>\n",
129
+ " <td>Heja!=頑張れ!</td>\n",
130
+ " <td>2022-06-22 01:13:27</td>\n",
131
+ " <td>176860217</td>\n",
132
+ " <td>[]</td>\n",
133
+ " <td>0</td>\n",
134
+ " <td>0</td>\n",
135
+ " <td>0</td>\n",
136
+ " </tr>\n",
137
+ " <tr>\n",
138
+ " <th>9</th>\n",
139
+ " <td>1539379141597925377</td>\n",
140
+ " <td>skończyłam po 15h naukę na dziś ❤️😋 wrócę po u...</td>\n",
141
+ " <td>2022-06-22 00:46:02</td>\n",
142
+ " <td>840668853948559360</td>\n",
143
+ " <td>[]</td>\n",
144
+ " <td>0</td>\n",
145
+ " <td>0</td>\n",
146
+ " <td>0</td>\n",
147
+ " </tr>\n",
148
+ " <tr>\n",
149
+ " <th>10</th>\n",
150
+ " <td>1539377784707026945</td>\n",
151
+ " <td>je suis en train de siroter mon candy up frche...</td>\n",
152
+ " <td>2022-06-22 00:40:38</td>\n",
153
+ " <td>980874157998137345</td>\n",
154
+ " <td>[]</td>\n",
155
+ " <td>3</td>\n",
156
+ " <td>0</td>\n",
157
+ " <td>0</td>\n",
158
+ " </tr>\n",
159
+ " </tbody>\n",
160
+ "</table>\n",
161
+ "</div>"
162
+ ],
163
+ "text/plain": [
164
+ " id tweet \\\n",
165
+ "0 1539394015560359944 wAllah comme si on avais pas d’autre choses j’... \n",
166
+ "5 1539387277960433664 Şev baş temaşevanen heja https://t.co/aqw5vNPLFr \n",
167
+ "7 1539386040313851904 Heja!=頑張れ! \n",
168
+ "9 1539379141597925377 skończyłam po 15h naukę na dziś ❤️😋 wrócę po u... \n",
169
+ "10 1539377784707026945 je suis en train de siroter mon candy up frche... \n",
170
+ "\n",
171
+ " date user_id urls \\\n",
172
+ "0 2022-06-22 01:45:08 1202681666487115776 [] \n",
173
+ "5 2022-06-22 01:18:22 743954955220979713 [https://youtu.be/n_vYzgRBFUI] \n",
174
+ "7 2022-06-22 01:13:27 176860217 [] \n",
175
+ "9 2022-06-22 00:46:02 840668853948559360 [] \n",
176
+ "10 2022-06-22 00:40:38 980874157998137345 [] \n",
177
+ "\n",
178
+ " nlikes nreplies nretweets \n",
179
+ "0 3 0 0 \n",
180
+ "5 0 0 0 \n",
181
+ "7 0 0 0 \n",
182
+ "9 0 0 0 \n",
183
+ "10 3 0 0 "
184
+ ]
185
+ },
186
+ "execution_count": 8,
187
+ "metadata": {},
188
+ "output_type": "execute_result"
189
  }
190
  ],
191
  "source": [
192
+ "d=sc.scrape_by_string(\"heja\")\n",
193
+ "d.head()"
 
194
  ]
195
  },
196
  {
197
  "cell_type": "code",
198
+ "execution_count": 11,
199
+ "id": "a7912a91",
200
  "metadata": {},
201
  "outputs": [
202
  {
203
  "name": "stdout",
204
  "output_type": "stream",
205
  "text": [
206
+ "[+] Finished: Successfully collected 100 Tweets.\n"
207
  ]
208
+ },
209
+ {
210
+ "data": {
211
+ "text/html": [
212
+ "<div>\n",
213
+ "<style scoped>\n",
214
+ " .dataframe tbody tr th:only-of-type {\n",
215
+ " vertical-align: middle;\n",
216
+ " }\n",
217
+ "\n",
218
+ " .dataframe tbody tr th {\n",
219
+ " vertical-align: top;\n",
220
+ " }\n",
221
+ "\n",
222
+ " .dataframe thead th {\n",
223
+ " text-align: right;\n",
224
+ " }\n",
225
+ "</style>\n",
226
+ "<table border=\"1\" class=\"dataframe\">\n",
227
+ " <thead>\n",
228
+ " <tr style=\"text-align: right;\">\n",
229
+ " <th></th>\n",
230
+ " <th>id</th>\n",
231
+ " <th>tweet</th>\n",
232
+ " <th>date</th>\n",
233
+ " <th>user_id</th>\n",
234
+ " <th>urls</th>\n",
235
+ " <th>nlikes</th>\n",
236
+ " <th>nreplies</th>\n",
237
+ " <th>nretweets</th>\n",
238
+ " </tr>\n",
239
+ " </thead>\n",
240
+ " <tbody>\n",
241
+ " <tr>\n",
242
+ " <th>1</th>\n",
243
+ " <td>1537770920621879297</td>\n",
244
+ " <td>Man kan ha synpunkter på en sådan lösning, men...</td>\n",
245
+ " <td>2022-06-17 14:15:32</td>\n",
246
+ " <td>95972673</td>\n",
247
+ " <td>[]</td>\n",
248
+ " <td>692</td>\n",
249
+ " <td>17</td>\n",
250
+ " <td>41</td>\n",
251
+ " </tr>\n",
252
+ " <tr>\n",
253
+ " <th>2</th>\n",
254
+ " <td>1537770809225273344</td>\n",
255
+ " <td>Är det ont om plats på anstalterna så får man ...</td>\n",
256
+ " <td>2022-06-17 14:15:05</td>\n",
257
+ " <td>95972673</td>\n",
258
+ " <td>[]</td>\n",
259
+ " <td>809</td>\n",
260
+ " <td>26</td>\n",
261
+ " <td>57</td>\n",
262
+ " </tr>\n",
263
+ " <tr>\n",
264
+ " <th>3</th>\n",
265
+ " <td>1537770713368735744</td>\n",
266
+ " <td>Döms man för brott, särskilt våldsbrott, ska m...</td>\n",
267
+ " <td>2022-06-17 14:14:43</td>\n",
268
+ " <td>95972673</td>\n",
269
+ " <td>[]</td>\n",
270
+ " <td>1020</td>\n",
271
+ " <td>26</td>\n",
272
+ " <td>86</td>\n",
273
+ " </tr>\n",
274
+ " <tr>\n",
275
+ " <th>4</th>\n",
276
+ " <td>1537770657823576066</td>\n",
277
+ " <td>Platsbrist? Jaha, vad spelar det för roll? D...</td>\n",
278
+ " <td>2022-06-17 14:14:29</td>\n",
279
+ " <td>95972673</td>\n",
280
+ " <td>[https://sverigesradio.se/artikel/domda-kvinno...</td>\n",
281
+ " <td>1152</td>\n",
282
+ " <td>85</td>\n",
283
+ " <td>132</td>\n",
284
+ " </tr>\n",
285
+ " <tr>\n",
286
+ " <th>5</th>\n",
287
+ " <td>1534230353094885383</td>\n",
288
+ " <td>Det är ytterst beklagligt att Magdalena Anders...</td>\n",
289
+ " <td>2022-06-07 19:46:35</td>\n",
290
+ " <td>95972673</td>\n",
291
+ " <td>[]</td>\n",
292
+ " <td>6121</td>\n",
293
+ " <td>546</td>\n",
294
+ " <td>557</td>\n",
295
+ " </tr>\n",
296
+ " </tbody>\n",
297
+ "</table>\n",
298
+ "</div>"
299
+ ],
300
+ "text/plain": [
301
+ " id tweet \\\n",
302
+ "1 1537770920621879297 Man kan ha synpunkter på en sådan lösning, men... \n",
303
+ "2 1537770809225273344 Är det ont om plats på anstalterna så får man ... \n",
304
+ "3 1537770713368735744 Döms man för brott, särskilt våldsbrott, ska m... \n",
305
+ "4 1537770657823576066 Platsbrist? Jaha, vad spelar det för roll? D... \n",
306
+ "5 1534230353094885383 Det är ytterst beklagligt att Magdalena Anders... \n",
307
+ "\n",
308
+ " date user_id \\\n",
309
+ "1 2022-06-17 14:15:32 95972673 \n",
310
+ "2 2022-06-17 14:15:05 95972673 \n",
311
+ "3 2022-06-17 14:14:43 95972673 \n",
312
+ "4 2022-06-17 14:14:29 95972673 \n",
313
+ "5 2022-06-07 19:46:35 95972673 \n",
314
+ "\n",
315
+ " urls nlikes nreplies \\\n",
316
+ "1 [] 692 17 \n",
317
+ "2 [] 809 26 \n",
318
+ "3 [] 1020 26 \n",
319
+ "4 [https://sverigesradio.se/artikel/domda-kvinno... 1152 85 \n",
320
+ "5 [] 6121 546 \n",
321
+ "\n",
322
+ " nretweets \n",
323
+ "1 41 \n",
324
+ "2 57 \n",
325
+ "3 86 \n",
326
+ "4 132 \n",
327
+ "5 557 "
328
+ ]
329
+ },
330
+ "execution_count": 11,
331
+ "metadata": {},
332
+ "output_type": "execute_result"
333
  }
334
  ],
335
  "source": [
336
+ "df=sc.scrape_by_user(\"jimmieakesson\")\n",
337
+ "df.head()"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
338
  ]
339
  },
340
  {
341
  "cell_type": "code",
342
  "execution_count": 12,
343
+ "id": "7db69757",
344
  "metadata": {},
345
  "outputs": [
346
  {
347
  "name": "stdout",
348
  "output_type": "stream",
349
  "text": [
350
+ "[!] No more data! Scraping will stop now.\n",
351
+ "found 0 deleted tweets in this search.\n",
352
+ "[+] Finished: Successfully collected 16 Tweets from @jimmieakesson.\n"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
353
  ]
354
  },
355
  {
356
+ "data": {
357
+ "text/html": [
358
+ "<div>\n",
359
+ "<style scoped>\n",
360
+ " .dataframe tbody tr th:only-of-type {\n",
361
+ " vertical-align: middle;\n",
362
+ " }\n",
363
+ "\n",
364
+ " .dataframe tbody tr th {\n",
365
+ " vertical-align: top;\n",
366
+ " }\n",
367
+ "\n",
368
+ " .dataframe thead th {\n",
369
+ " text-align: right;\n",
370
+ " }\n",
371
+ "</style>\n",
372
+ "<table border=\"1\" class=\"dataframe\">\n",
373
+ " <thead>\n",
374
+ " <tr style=\"text-align: right;\">\n",
375
+ " <th></th>\n",
376
+ " <th>id</th>\n",
377
+ " <th>tweet</th>\n",
378
+ " <th>date</th>\n",
379
+ " <th>user_id</th>\n",
380
+ " <th>urls</th>\n",
381
+ " <th>nlikes</th>\n",
382
+ " <th>nreplies</th>\n",
383
+ " <th>nretweets</th>\n",
384
+ " </tr>\n",
385
+ " </thead>\n",
386
+ " <tbody>\n",
387
+ " <tr>\n",
388
+ " <th>0</th>\n",
389
+ " <td>1363067834260201475</td>\n",
390
+ " <td>Utan massiv, asylrelaterad invandring från frä...</td>\n",
391
+ " <td>2021-02-20 11:07:50</td>\n",
392
+ " <td>95972673</td>\n",
393
+ " <td>[]</td>\n",
394
+ " <td>1277</td>\n",
395
+ " <td>22</td>\n",
396
+ " <td>105</td>\n",
397
+ " </tr>\n",
398
+ " <tr>\n",
399
+ " <th>1</th>\n",
400
+ " <td>1363067613660778496</td>\n",
401
+ " <td>Många vänsterliberaler tycks ha reagerat på de...</td>\n",
402
+ " <td>2021-02-20 11:06:58</td>\n",
403
+ " <td>95972673</td>\n",
404
+ " <td>[]</td>\n",
405
+ " <td>626</td>\n",
406
+ " <td>9</td>\n",
407
+ " <td>68</td>\n",
408
+ " </tr>\n",
409
+ " <tr>\n",
410
+ " <th>2</th>\n",
411
+ " <td>1363067558409158656</td>\n",
412
+ " <td>Jag förstår — uppriktigt — inte den närmast hy...</td>\n",
413
+ " <td>2021-02-20 11:06:45</td>\n",
414
+ " <td>95972673</td>\n",
415
+ " <td>[]</td>\n",
416
+ " <td>2459</td>\n",
417
+ " <td>199</td>\n",
418
+ " <td>336</td>\n",
419
+ " </tr>\n",
420
+ " <tr>\n",
421
+ " <th>3</th>\n",
422
+ " <td>1362748777552113670</td>\n",
423
+ " <td>Invandring av hundratusentals människor från f...</td>\n",
424
+ " <td>2021-02-19 14:00:01</td>\n",
425
+ " <td>95972673</td>\n",
426
+ " <td>[]</td>\n",
427
+ " <td>1334</td>\n",
428
+ " <td>55</td>\n",
429
+ " <td>101</td>\n",
430
+ " </tr>\n",
431
+ " <tr>\n",
432
+ " <th>4</th>\n",
433
+ " <td>1362409505557012490</td>\n",
434
+ " <td>Vårt land behöver ett totalstopp för all asyl-...</td>\n",
435
+ " <td>2021-02-18 15:31:53</td>\n",
436
+ " <td>95972673</td>\n",
437
+ " <td>[]</td>\n",
438
+ " <td>3044</td>\n",
439
+ " <td>268</td>\n",
440
+ " <td>404</td>\n",
441
+ " </tr>\n",
442
+ " </tbody>\n",
443
+ "</table>\n",
444
+ "</div>"
445
+ ],
446
+ "text/plain": [
447
+ " id tweet \\\n",
448
+ "0 1363067834260201475 Utan massiv, asylrelaterad invandring från frä... \n",
449
+ "1 1363067613660778496 Många vänsterliberaler tycks ha reagerat på de... \n",
450
+ "2 1363067558409158656 Jag förstår — uppriktigt — inte den närmast hy... \n",
451
+ "3 1362748777552113670 Invandring av hundratusentals människor från f... \n",
452
+ "4 1362409505557012490 Vårt land behöver ett totalstopp för all asyl-... \n",
453
+ "\n",
454
+ " date user_id urls nlikes nreplies nretweets \n",
455
+ "0 2021-02-20 11:07:50 95972673 [] 1277 22 105 \n",
456
+ "1 2021-02-20 11:06:58 95972673 [] 626 9 68 \n",
457
+ "2 2021-02-20 11:06:45 95972673 [] 2459 199 336 \n",
458
+ "3 2021-02-19 14:00:01 95972673 [] 1334 55 101 \n",
459
+ "4 2021-02-18 15:31:53 95972673 [] 3044 268 404 "
460
+ ]
461
+ },
462
+ "execution_count": 12,
463
+ "metadata": {},
464
+ "output_type": "execute_result"
465
  }
466
  ],
467
  "source": [
468
+ "df=sc.scrape_by_user_and_string(\"jimmieakesson\",\"invandring\")\n",
469
+ "df.head()"
470
  ]
471
  },
472
  {
473
  "cell_type": "code",
474
  "execution_count": null,
475
+ "id": "48d50b46",
476
  "metadata": {},
477
  "outputs": [],
478
+ "source": [
479
+ "tweets= _data.keys()\n",
480
+ "for i in tweets:\n",
481
+ " _data[i][\"tweet\"]\n",
482
+ " print(_data[i][\"tweet\"], \"\\n\", \"__________________________________________________________\")"
483
+ ]
484
  }
485
  ],
486
  "metadata": {
twitter_scraper/twint_master/elasticsearch/dashboard.json DELETED
@@ -1,18 +0,0 @@
1
- [
2
- {
3
- "_id": "e6d65380-bfe2-11e8-961a-d371b24d5d1d",
4
- "_type": "dashboard",
5
- "_source": {
6
- "title": "Twint Dashboard",
7
- "hits": 0,
8
- "description": "",
9
- "panelsJSON": "[{\"panelIndex\":\"1\",\"gridData\":{\"x\":0,\"y\":0,\"w\":40,\"h\":17,\"i\":\"1\"},\"embeddableConfig\":{},\"id\":\"d47421c0-bfd5-11e8-8858-bbc566841533\",\"type\":\"visualization\",\"version\":\"6.4.1\"},{\"panelIndex\":\"2\",\"gridData\":{\"x\":40,\"y\":6,\"w\":8,\"h\":11,\"i\":\"2\"},\"embeddableConfig\":{\"vis\":{\"legendOpen\":false}},\"id\":\"e2b89640-bfd4-11e8-8858-bbc566841533\",\"type\":\"visualization\",\"version\":\"6.4.1\"},{\"panelIndex\":\"3\",\"gridData\":{\"x\":0,\"y\":32,\"w\":20,\"h\":17,\"i\":\"3\"},\"embeddableConfig\":{\"vis\":{\"legendOpen\":false}},\"id\":\"8a8bb420-bfd9-11e8-8858-bbc566841533\",\"type\":\"visualization\",\"version\":\"6.4.1\"},{\"panelIndex\":\"4\",\"gridData\":{\"x\":0,\"y\":17,\"w\":33,\"h\":15,\"i\":\"4\"},\"embeddableConfig\":{\"vis\":{\"legendOpen\":false}},\"id\":\"a8d3ee70-bfd9-11e8-8858-bbc566841533\",\"type\":\"visualization\",\"version\":\"6.4.1\"},{\"panelIndex\":\"6\",\"gridData\":{\"x\":40,\"y\":0,\"w\":8,\"h\":6,\"i\":\"6\"},\"embeddableConfig\":{},\"id\":\"37cd72e0-bfe4-11e8-961a-d371b24d5d1d\",\"type\":\"visualization\",\"version\":\"6.4.1\"},{\"panelIndex\":\"7\",\"gridData\":{\"x\":33,\"y\":17,\"w\":15,\"h\":15,\"i\":\"7\"},\"embeddableConfig\":{},\"id\":\"149ecbc0-bfe4-11e8-961a-d371b24d5d1d\",\"type\":\"visualization\",\"version\":\"6.4.1\"},{\"panelIndex\":\"8\",\"gridData\":{\"x\":20,\"y\":32,\"w\":28,\"h\":17,\"i\":\"8\"},\"version\":\"6.3.2\",\"type\":\"visualization\",\"id\":\"b45ec590-c267-11e8-bcd4-3956fe930db7\",\"embeddableConfig\":{}}]",
10
- "optionsJSON": "{\"darkTheme\":true,\"hidePanelTitles\":true,\"useMargins\":true}",
11
- "version": 1,
12
- "timeRestore": false,
13
- "kibanaSavedObjectMeta": {
14
- "searchSourceJSON": "{\"query\":{\"language\":\"lucene\",\"query\":\"\"},\"filter\":[],\"highlightAll\":true,\"version\":true}"
15
- }
16
- }
17
- }
18
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
twitter_scraper/twint_master/elasticsearch/index-follow.json DELETED
@@ -1,15 +0,0 @@
1
- PUT twintgraph
2
- {
3
- "mappings": {
4
- "items": {
5
- "properties": {
6
- "user": {"type": "keyword"},
7
- "follow": {"type": "keyword"},
8
- "essid": {"type": "keyword"}
9
- }
10
- }
11
- },
12
- "settings": {
13
- "number_of_shards": 1
14
- }
15
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
twitter_scraper/twint_master/elasticsearch/index-tweets.json DELETED
@@ -1,48 +0,0 @@
1
- PUT twinttweets
2
- {
3
- "mappings": {
4
- "items": {
5
- "properties": {
6
- "id": {"type": "long"},
7
- "conversation_id": {"type": "long"},
8
- "created_at": {"type": "long"},
9
- "date": {"type": "date", "format": "yyyy-MM-dd HH:mm:ss"},
10
- "timezone": {"type": "keyword"},
11
- "place": {"type": "keyword"},
12
- "location": {"type": "keyword"},
13
- "tweet": {"type": "text"},
14
- "hashtags": {"type": "keyword"},
15
- "cashtags": {"type": "keyword"},
16
- "user_id": {"type": "long"},
17
- "user_id_str": {"type": "keyword"},
18
- "username": {"type": "keyword"},
19
- "name": {"type": "text"},
20
- "profile_image_url": {"type": "text"},
21
- "day": {"type": "integer"},
22
- "hour": {"type": "integer"},
23
- "link": {"type": "text"},
24
- "retweet": {"type": "text"},
25
- "essid": {"type": "keyword"},
26
- "nlikes": {"type": "integer"},
27
- "nreplies": {"type": "integer"},
28
- "nretweets": {"type": "integer"},
29
- "quote_url": {"type": "text"},
30
- "video": {"type": "integer"},
31
- "thumbnail": {"type": "text"},
32
- "search": {"type": "text"},
33
- "near": {"type": "text"},
34
- "geo_near": {"type": "geo_point"},
35
- "geo_tweet": {"type": "geo_point"},
36
- "photos": {"type": "text"},
37
- "mentions": {"type": "text"},
38
- "translation": {"type": "text"},
39
- "trans_src": {"type": "keyword"},
40
- "trans_dev": {"type": "keyword"},
41
- }
42
- }
43
- }
44
- ,
45
- "settings": {
46
- "number_of_shards": 1
47
- }
48
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
twitter_scraper/twint_master/elasticsearch/index-user.json DELETED
@@ -1,33 +0,0 @@
1
- PUT twintuser
2
- {
3
- "mappings": {
4
- "items": {
5
- "properties": {
6
- "id": {"type": "keyword"},
7
- "name": {"type": "keyword"},
8
- "username": {"type": "keyword"},
9
- "bio": {"type": "text"},
10
- "location": {"type": "keyword"},
11
- "url": {"type": "text"},
12
- "join_datetime": {"type": "date", "format": "yyyy-MM-dd HH:mm:ss"},
13
- "join_date": {"type": "date", "format": "yyyy-MM-dd"},
14
- "join_time": {"type": "date", "format": "HH:mm:ss"},
15
- "tweets": {"type": "integer"},
16
- "following": {"type": "integer"},
17
- "followers": {"type": "integer"},
18
- "likes": {"type": "integer"},
19
- "media": {"type": "integer"},
20
- "private": {"type": "integer"},
21
- "verified": {"type": "integer"},
22
- "avatar": {"type": "text"},
23
- "background_image": {"type": "text"},
24
- "session": {"type": "keyword"},
25
- "geo_user": {"type": "geo_point"}
26
- }
27
- }
28
- }
29
- ,
30
- "settings": {
31
- "number_of_shards": 1
32
- }
33
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
twitter_scraper/twint_master/elasticsearch/visualizations.json DELETED
@@ -1,100 +0,0 @@
1
- [
2
- {
3
- "_id": "d47421c0-bfd5-11e8-8858-bbc566841533",
4
- "_type": "visualization",
5
- "_source": {
6
- "title": "Activity [twinttweets]",
7
- "visState": "{\"title\":\"Activity [twinttweets]\",\"type\":\"histogram\",\"params\":{\"type\":\"histogram\",\"grid\":{\"categoryLines\":true,\"style\":{\"color\":\"#eee\"},\"valueAxis\":\"ValueAxis-1\"},\"categoryAxes\":[{\"id\":\"CategoryAxis-1\",\"type\":\"category\",\"position\":\"bottom\",\"show\":true,\"style\":{},\"scale\":{\"type\":\"linear\"},\"labels\":{\"show\":true,\"truncate\":100},\"title\":{}}],\"valueAxes\":[{\"id\":\"ValueAxis-1\",\"name\":\"LeftAxis-1\",\"type\":\"value\",\"position\":\"left\",\"show\":true,\"style\":{},\"scale\":{\"type\":\"linear\",\"mode\":\"normal\"},\"labels\":{\"show\":true,\"rotate\":0,\"filter\":false,\"truncate\":100},\"title\":{\"text\":\"Tweets\"}}],\"seriesParams\":[{\"show\":\"true\",\"type\":\"area\",\"mode\":\"stacked\",\"data\":{\"label\":\"Tweets\",\"id\":\"1\"},\"valueAxis\":\"ValueAxis-1\",\"drawLinesBetweenPoints\":true,\"showCircles\":true,\"interpolate\":\"cardinal\"}],\"addTooltip\":true,\"addLegend\":true,\"legendPosition\":\"right\",\"times\":[],\"addTimeMarker\":true},\"aggs\":[{\"id\":\"1\",\"enabled\":true,\"type\":\"count\",\"schema\":\"metric\",\"params\":{\"customLabel\":\"Tweets\"}},{\"id\":\"2\",\"enabled\":true,\"type\":\"date_histogram\",\"schema\":\"segment\",\"params\":{\"field\":\"date\",\"interval\":\"auto\",\"customInterval\":\"2h\",\"min_doc_count\":1,\"extended_bounds\":{},\"customLabel\":\"Days\"}},{\"id\":\"3\",\"enabled\":true,\"type\":\"terms\",\"schema\":\"group\",\"params\":{\"field\":\"user_id\",\"size\":5,\"order\":\"desc\",\"orderBy\":\"1\",\"otherBucket\":false,\"otherBucketLabel\":\"Other\",\"missingBucket\":false,\"missingBucketLabel\":\"Missing\",\"customLabel\":\"User ids\"}}]}",
8
- "uiStateJSON": "{}",
9
- "description": "",
10
- "version": 1,
11
- "kibanaSavedObjectMeta": {
12
- "searchSourceJSON": "{\"index\":\"755f4660-bfee-11e8-9911-5b8e1e9c87c6\",\"query\":{\"query\":\"NOT _exists_:likes NOT _exists_:retweets NOT _exists_:replies\",\"language\":\"lucene\"},\"filter\":[]}"
13
- }
14
- }
15
- },
16
- {
17
- "_id": "e2b89640-bfd4-11e8-8858-bbc566841533",
18
- "_type": "visualization",
19
- "_source": {
20
- "title": "Activity - pie [twinttweets]",
21
- "visState": "{\"aggs\":[{\"enabled\":true,\"id\":\"1\",\"params\":{},\"schema\":\"metric\",\"type\":\"count\"},{\"enabled\":true,\"id\":\"2\",\"params\":{\"field\":\"user_id\",\"missingBucket\":false,\"missingBucketLabel\":\"Missing\",\"order\":\"desc\",\"orderBy\":\"1\",\"otherBucket\":false,\"otherBucketLabel\":\"Other\",\"size\":5},\"schema\":\"segment\",\"type\":\"terms\"}],\"params\":{\"addLegend\":true,\"addTooltip\":true,\"isDonut\":true,\"labels\":{\"last_level\":true,\"show\":false,\"truncate\":100,\"values\":true},\"legendPosition\":\"right\",\"type\":\"pie\"},\"title\":\"Activity - pie [twinttweets]\",\"type\":\"pie\"}",
22
- "uiStateJSON": "{}",
23
- "description": "",
24
- "version": 1,
25
- "kibanaSavedObjectMeta": {
26
- "searchSourceJSON": "{\"index\":\"755f4660-bfee-11e8-9911-5b8e1e9c87c6\",\"query\":{\"language\":\"lucene\",\"query\":\"NOT _exists_:likes NOT _exists_:retweets NOT _exists_:replies\"},\"filter\":[]}"
27
- }
28
- }
29
- },
30
- {
31
- "_id": "37cd72e0-bfe4-11e8-961a-d371b24d5d1d",
32
- "_type": "visualization",
33
- "_source": {
34
- "title": "Tweets Count [twinttweet]",
35
- "visState": "{\"title\":\"Tweets Count [twinttweet]\",\"type\":\"metric\",\"params\":{\"addTooltip\":true,\"addLegend\":false,\"type\":\"metric\",\"metric\":{\"percentageMode\":false,\"useRanges\":false,\"colorSchema\":\"Green to Red\",\"metricColorMode\":\"None\",\"colorsRange\":[{\"from\":0,\"to\":10000}],\"labels\":{\"show\":true},\"invertColors\":false,\"style\":{\"bgFill\":\"#000\",\"bgColor\":false,\"labelColor\":false,\"subText\":\"\",\"fontSize\":33}}},\"aggs\":[{\"id\":\"1\",\"enabled\":true,\"type\":\"count\",\"schema\":\"metric\",\"params\":{\"customLabel\":\"Tweets\"}}]}",
36
- "uiStateJSON": "{}",
37
- "description": "",
38
- "version": 1,
39
- "kibanaSavedObjectMeta": {
40
- "searchSourceJSON": "{\"index\":\"755f4660-bfee-11e8-9911-5b8e1e9c87c6\",\"query\":{\"language\":\"lucene\",\"query\":\"NOT _exists_:likes NOT _exists_:retweets NOT _exists_:replies\"},\"filter\":[]}"
41
- }
42
- }
43
- },
44
- {
45
- "_id": "149ecbc0-bfe4-11e8-961a-d371b24d5d1d",
46
- "_type": "visualization",
47
- "_source": {
48
- "title": "Word Cloud [twinttweets]",
49
- "visState": "{\"title\":\"Word Cloud [twinttweets]\",\"type\":\"tagcloud\",\"params\":{\"scale\":\"linear\",\"orientation\":\"single\",\"minFontSize\":10,\"maxFontSize\":50,\"showLabel\":false},\"aggs\":[{\"id\":\"1\",\"enabled\":true,\"type\":\"count\",\"schema\":\"metric\",\"params\":{}},{\"id\":\"2\",\"enabled\":true,\"type\":\"terms\",\"schema\":\"segment\",\"params\":{\"field\":\"username\",\"otherBucket\":false,\"otherBucketLabel\":\"Other\",\"missingBucket\":false,\"missingBucketLabel\":\"Missing\",\"size\":10,\"order\":\"desc\",\"orderBy\":\"1\"}}]}",
50
- "uiStateJSON": "{}",
51
- "description": "",
52
- "version": 1,
53
- "kibanaSavedObjectMeta": {
54
- "searchSourceJSON": "{\"index\":\"755f4660-bfee-11e8-9911-5b8e1e9c87c6\",\"query\":{\"query\":\"NOT _exists_:likes NOT _exists_:retweets NOT _exists_:replies\",\"language\":\"lucene\"},\"filter\":[]}"
55
- }
56
- }
57
- },
58
- {
59
- "_id": "a8d3ee70-bfd9-11e8-8858-bbc566841533",
60
- "_type": "visualization",
61
- "_source": {
62
- "title": "Day-activity [twinttweet]",
63
- "visState": "{\"title\":\"Day-activity [twinttweet]\",\"type\":\"histogram\",\"params\":{\"addLegend\":true,\"addTimeMarker\":false,\"addTooltip\":true,\"categoryAxes\":[{\"id\":\"CategoryAxis-1\",\"labels\":{\"show\":true,\"truncate\":100,\"rotate\":0},\"position\":\"bottom\",\"scale\":{\"type\":\"linear\"},\"show\":true,\"style\":{},\"title\":{},\"type\":\"category\"}],\"grid\":{\"categoryLines\":true,\"style\":{\"color\":\"#eee\"},\"valueAxis\":\"ValueAxis-3\"},\"legendPosition\":\"right\",\"orderBucketsBySum\":false,\"seriesParams\":[{\"data\":{\"id\":\"1\",\"label\":\"Tweets\"},\"drawLinesBetweenPoints\":true,\"mode\":\"normal\",\"show\":\"true\",\"showCircles\":true,\"type\":\"histogram\",\"valueAxis\":\"ValueAxis-3\"}],\"times\":[],\"type\":\"histogram\",\"valueAxes\":[{\"id\":\"ValueAxis-3\",\"labels\":{\"filter\":false,\"rotate\":0,\"show\":true,\"truncate\":100},\"name\":\"LeftAxis-1\",\"position\":\"left\",\"scale\":{\"mode\":\"normal\",\"type\":\"linear\"},\"show\":true,\"style\":{},\"title\":{\"text\":\"Tweets\"},\"type\":\"value\"}]},\"aggs\":[{\"id\":\"1\",\"enabled\":true,\"type\":\"count\",\"schema\":\"metric\",\"params\":{\"customLabel\":\"Tweets\"}},{\"id\":\"2\",\"enabled\":true,\"type\":\"histogram\",\"schema\":\"segment\",\"params\":{\"field\":\"hour\",\"interval\":1,\"min_doc_count\":true,\"extended_bounds\":{\"min\":0,\"max\":23}}},{\"id\":\"3\",\"enabled\":true,\"type\":\"terms\",\"schema\":\"group\",\"params\":{\"field\":\"user_id\",\"otherBucket\":false,\"otherBucketLabel\":\"Other\",\"missingBucket\":false,\"missingBucketLabel\":\"Missing\",\"size\":10,\"order\":\"asc\",\"orderBy\":\"_term\",\"customLabel\":\"\"}}]}",
64
- "uiStateJSON": "{\"vis\":{\"legendOpen\":true}}",
65
- "description": "",
66
- "version": 1,
67
- "kibanaSavedObjectMeta": {
68
- "searchSourceJSON": "{\"index\":\"755f4660-bfee-11e8-9911-5b8e1e9c87c6\",\"query\":{\"language\":\"lucene\",\"query\":\"NOT _exists_:likes NOT _exists_:retweets NOT _exists_:replies\"},\"filter\":[]}"
69
- }
70
- }
71
- },
72
- {
73
- "_id": "8a8bb420-bfd9-11e8-8858-bbc566841533",
74
- "_type": "visualization",
75
- "_source": {
76
- "title": "Week-activity [twinttweet]",
77
- "visState": "{\"title\":\"Week-activity [twinttweet]\",\"type\":\"histogram\",\"params\":{\"type\":\"histogram\",\"grid\":{\"categoryLines\":true,\"style\":{\"color\":\"#eee\"},\"valueAxis\":\"ValueAxis-1\"},\"categoryAxes\":[{\"id\":\"CategoryAxis-1\",\"type\":\"category\",\"position\":\"bottom\",\"show\":true,\"style\":{},\"scale\":{\"type\":\"linear\"},\"labels\":{\"show\":true,\"truncate\":100,\"rotate\":0},\"title\":{}}],\"valueAxes\":[{\"id\":\"ValueAxis-1\",\"name\":\"LeftAxis-1\",\"type\":\"value\",\"position\":\"left\",\"show\":true,\"style\":{},\"scale\":{\"type\":\"linear\",\"mode\":\"normal\"},\"labels\":{\"show\":true,\"rotate\":0,\"filter\":false,\"truncate\":100},\"title\":{\"text\":\"Tweets\"}}],\"seriesParams\":[{\"show\":\"true\",\"type\":\"histogram\",\"mode\":\"normal\",\"data\":{\"label\":\"Tweets\",\"id\":\"1\"},\"valueAxis\":\"ValueAxis-1\",\"drawLinesBetweenPoints\":true,\"showCircles\":true}],\"addTooltip\":true,\"addLegend\":true,\"legendPosition\":\"right\",\"times\":[],\"addTimeMarker\":false},\"aggs\":[{\"id\":\"1\",\"enabled\":true,\"type\":\"count\",\"schema\":\"metric\",\"params\":{\"customLabel\":\"Tweets\"}},{\"id\":\"2\",\"enabled\":true,\"type\":\"histogram\",\"schema\":\"segment\",\"params\":{\"field\":\"day\",\"interval\":1,\"min_doc_count\":true,\"extended_bounds\":{},\"customLabel\":\"Days of the week\"}},{\"id\":\"3\",\"enabled\":true,\"type\":\"terms\",\"schema\":\"group\",\"params\":{\"field\":\"user_id\",\"otherBucket\":false,\"otherBucketLabel\":\"Other\",\"missingBucket\":false,\"missingBucketLabel\":\"Missing\",\"size\":10,\"order\":\"desc\",\"orderBy\":\"1\",\"customLabel\":\"\"}}]}",
78
- "uiStateJSON": "{}",
79
- "description": "",
80
- "version": 1,
81
- "kibanaSavedObjectMeta": {
82
- "searchSourceJSON": "{\"index\":\"755f4660-bfee-11e8-9911-5b8e1e9c87c6\",\"query\":{\"query\":\"NOT _exists_:likes NOT _exists_:retweets NOT _exists_:replies\",\"language\":\"lucene\"},\"filter\":[]}"
83
- }
84
- }
85
- },
86
- {
87
- "_id": "b45ec590-c267-11e8-bcd4-3956fe930db7",
88
- "_type": "visualization",
89
- "_source": {
90
- "title": "Heat-map [twinttweets]",
91
- "visState": "{\"title\":\"Heat-map [twinttweets]\",\"type\":\"heatmap\",\"params\":{\"type\":\"heatmap\",\"addTooltip\":true,\"addLegend\":true,\"enableHover\":true,\"legendPosition\":\"right\",\"times\":[],\"colorsNumber\":10,\"colorSchema\":\"Reds\",\"setColorRange\":false,\"colorsRange\":[{\"from\":0,\"to\":10},{\"from\":10,\"to\":100},{\"from\":100,\"to\":200},{\"from\":200,\"to\":500},{\"from\":500,\"to\":1000},{\"from\":1000,\"to\":2000},{\"from\":2000,\"to\":3000},{\"from\":3000,\"to\":4000},{\"from\":4000,\"to\":5000},{\"from\":7000,\"to\":null}],\"invertColors\":false,\"percentageMode\":false,\"valueAxes\":[{\"show\":false,\"id\":\"ValueAxis-1\",\"type\":\"value\",\"scale\":{\"type\":\"linear\",\"defaultYExtents\":true},\"labels\":{\"show\":false,\"rotate\":270,\"overwriteColor\":false,\"color\":\"#555\"}}]},\"aggs\":[{\"id\":\"1\",\"enabled\":true,\"type\":\"count\",\"schema\":\"metric\",\"params\":{}},{\"id\":\"2\",\"enabled\":true,\"type\":\"histogram\",\"schema\":\"segment\",\"params\":{\"field\":\"hour\",\"interval\":1,\"min_doc_count\":false,\"extended_bounds\":{}}},{\"id\":\"3\",\"enabled\":true,\"type\":\"histogram\",\"schema\":\"group\",\"params\":{\"field\":\"day\",\"interval\":1,\"min_doc_count\":false,\"extended_bounds\":{\"min\":0,\"max\":2}}}]}",
92
- "uiStateJSON": "{\"vis\":{\"defaultColors\":{\"3 - 592\":\"rgb(255,245,240)\",\"592 - 1.180\":\"rgb(254,228,216)\",\"1.180 - 1.769\":\"rgb(253,202,181)\",\"1.769 - 2.357\":\"rgb(252,171,142)\",\"2.357 - 2.945\":\"rgb(252,138,106)\",\"2.945 - 3.534\":\"rgb(251,106,74)\",\"3.534 - 4.122\":\"rgb(241,68,50)\",\"4.122 - 4.711\":\"rgb(217,38,35)\",\"4.711 - 5.299\":\"rgb(188,20,26)\",\"5.299 - 5.887\":\"rgb(152,12,19)\"},\"colors\":{\"3 - 592\":\"#FCEACA\",\"592 - 1.180\":\"#F9E2D2\",\"1.180 - 1.769\":\"#F9BA8F\"}}}",
93
- "description": "",
94
- "version": 1,
95
- "kibanaSavedObjectMeta": {
96
- "searchSourceJSON": "{\"index\":\"755f4660-bfee-11e8-9911-5b8e1e9c87c6\",\"filter\":[],\"query\":{\"language\":\"lucene\",\"query\":\"\"}}"
97
- }
98
- }
99
- }
100
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
twitter_scraper/twint_master/extracted-tweets.txt DELETED
@@ -1,5 +0,0 @@
1
- '@annieloof Nej, jag håller med. Tänk mer som Mathias Andersson (SD). https://t.co/gSqQDz5N8z'
2
- 'Man kan ha synpunkter på en sådan lösning, men den är naturligtvis att föredra framför frigående våldsverkare som fortsätter misshandla sina offer i väntan på fängelse.'
3
- 'Är det ont om plats på anstalterna så får man sänka standarden rejält för att få rum med fler interner per kvadratmeter.'
4
- 'Döms man för brott, särskilt våldsbrott, ska man vara inlåst från det att domen faller tills straffet är avtjänat. Allt annat är vansinne.'
5
- 'Platsbrist? Jaha, vad spelar det för roll? \n\nDet gör mig förbannad och bestört att lösningen på problemet med överfulla fängelser verkar vara att dömda våldsbrottslingar får röra sig fritt i samhället istället för att sitta inlåsta. \n\nhttps://t.co/QDi9rM3kMC'
 
 
 
 
 
 
twitter_scraper/twint_master/requirements.txt DELETED
@@ -1,13 +0,0 @@
1
- aiohttp
2
- aiodns
3
- beautifulsoup4
4
- cchardet
5
- dataclasses
6
- elasticsearch
7
- pysocks
8
- pandas>=0.23.0
9
- aiohttp_socks<=0.4.1
10
- schedule
11
- geopy
12
- fake-useragent
13
- googletransx