Spaces:
Runtime error
Runtime error
Mosa
commited on
Commit
•
a2b888f
1
Parent(s):
2981ede
here is my changes
Browse files
twitter-scraper/twint-master/twitter_scraper.ipynb
CHANGED
@@ -10,7 +10,7 @@
|
|
10 |
},
|
11 |
{
|
12 |
"cell_type": "code",
|
13 |
-
"execution_count":
|
14 |
"id": "c9021300",
|
15 |
"metadata": {
|
16 |
"scrolled": true
|
@@ -18,9 +18,7 @@
|
|
18 |
"outputs": [],
|
19 |
"source": [
|
20 |
"%%capture \n",
|
21 |
-
"!pip3 install Twint \n"
|
22 |
-
"#!pip install asyncio\n",
|
23 |
-
"\n"
|
24 |
]
|
25 |
},
|
26 |
{
|
@@ -33,123 +31,209 @@
|
|
33 |
},
|
34 |
{
|
35 |
"cell_type": "code",
|
36 |
-
"execution_count":
|
37 |
"id": "1413ab2b",
|
38 |
"metadata": {},
|
39 |
"outputs": [],
|
40 |
"source": [
|
41 |
-
"import asyncio\n",
|
42 |
-
"import os\n",
|
43 |
-
"loop = asyncio.get_event_loop()\n",
|
44 |
-
"loop.is_running()\n",
|
45 |
-
"import twint\n",
|
46 |
-
"import nest_asyncio\n",
|
47 |
-
"nest_asyncio.apply()"
|
48 |
]
|
49 |
},
|
50 |
{
|
51 |
"cell_type": "code",
|
52 |
-
"execution_count":
|
53 |
-
"id": "
|
54 |
"metadata": {},
|
55 |
-
"outputs": [
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
}
|
67 |
-
],
|
68 |
"source": [
|
69 |
"from_date=\"2022-6-10 10:30:22\"\n",
|
70 |
"to_date= \"2022-6-30\"\n",
|
71 |
"num_tweets = 20\n",
|
72 |
-
"_data=get_tweets(\"jimmieakesson\",
|
73 |
-
"\n"
|
74 |
]
|
75 |
},
|
76 |
{
|
77 |
"cell_type": "code",
|
78 |
"execution_count": null,
|
79 |
-
"id": "
|
80 |
"metadata": {},
|
81 |
-
"outputs": [
|
82 |
-
{
|
83 |
-
"data": {
|
84 |
-
"text/plain": [
|
85 |
-
"9"
|
86 |
-
]
|
87 |
-
},
|
88 |
-
"execution_count": 17,
|
89 |
-
"metadata": {},
|
90 |
-
"output_type": "execute_result"
|
91 |
-
}
|
92 |
-
],
|
93 |
"source": [
|
94 |
-
"tweets=
|
95 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
96 |
]
|
97 |
},
|
98 |
{
|
99 |
"cell_type": "code",
|
100 |
"execution_count": null,
|
101 |
-
"id": "
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
102 |
"metadata": {},
|
103 |
"outputs": [
|
104 |
{
|
105 |
-
"
|
106 |
-
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
-
|
111 |
-
|
112 |
-
|
|
|
|
|
|
|
113 |
}
|
114 |
],
|
115 |
"source": [
|
116 |
-
"
|
|
|
|
|
117 |
]
|
118 |
},
|
119 |
{
|
120 |
"cell_type": "code",
|
121 |
-
"execution_count":
|
122 |
-
"id": "
|
123 |
"metadata": {},
|
124 |
-
"outputs": [
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
125 |
"source": [
|
126 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
127 |
]
|
128 |
},
|
129 |
{
|
130 |
"cell_type": "code",
|
131 |
-
"execution_count":
|
132 |
-
"id": "
|
133 |
"metadata": {},
|
134 |
-
"outputs": [
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
135 |
"source": [
|
136 |
-
"from_date
|
137 |
-
"to_date= \"2022-6-30\"\n",
|
138 |
-
"num_tweets = 20\n",
|
139 |
-
"_data=scrape.scraper.get_tweets(\"jimmieakesson\",u_or_s=\"u\")\n"
|
140 |
]
|
141 |
},
|
142 |
{
|
143 |
"cell_type": "code",
|
144 |
"execution_count": null,
|
145 |
-
"id": "
|
146 |
"metadata": {},
|
147 |
"outputs": [],
|
148 |
-
"source": [
|
149 |
-
"tweets= df[\"tweet\"]\n",
|
150 |
-
"for i in tweets:\n",
|
151 |
-
" print(i, \"\\n\", \"__________________________________________________________\")"
|
152 |
-
]
|
153 |
}
|
154 |
],
|
155 |
"metadata": {
|
|
|
10 |
},
|
11 |
{
|
12 |
"cell_type": "code",
|
13 |
+
"execution_count": null,
|
14 |
"id": "c9021300",
|
15 |
"metadata": {
|
16 |
"scrolled": true
|
|
|
18 |
"outputs": [],
|
19 |
"source": [
|
20 |
"%%capture \n",
|
21 |
+
"!pip3 install Twint \n"
|
|
|
|
|
22 |
]
|
23 |
},
|
24 |
{
|
|
|
31 |
},
|
32 |
{
|
33 |
"cell_type": "code",
|
34 |
+
"execution_count": null,
|
35 |
"id": "1413ab2b",
|
36 |
"metadata": {},
|
37 |
"outputs": [],
|
38 |
"source": [
|
39 |
+
"# import asyncio\n",
|
40 |
+
"# import os\n",
|
41 |
+
"# loop = asyncio.get_event_loop()\n",
|
42 |
+
"# loop.is_running()\n",
|
43 |
+
"# import twint\n",
|
44 |
+
"# import nest_asyncio\n",
|
45 |
+
"# nest_asyncio.apply()"
|
46 |
]
|
47 |
},
|
48 |
{
|
49 |
"cell_type": "code",
|
50 |
+
"execution_count": null,
|
51 |
+
"id": "d38514f3",
|
52 |
"metadata": {},
|
53 |
+
"outputs": [],
|
54 |
+
"source": [
|
55 |
+
"import scrape\n"
|
56 |
+
]
|
57 |
+
},
|
58 |
+
{
|
59 |
+
"cell_type": "code",
|
60 |
+
"execution_count": null,
|
61 |
+
"id": "a7912a91",
|
62 |
+
"metadata": {},
|
63 |
+
"outputs": [],
|
|
|
|
|
64 |
"source": [
|
65 |
"from_date=\"2022-6-10 10:30:22\"\n",
|
66 |
"to_date= \"2022-6-30\"\n",
|
67 |
"num_tweets = 20\n",
|
68 |
+
"_data=scrape.scraper.get_tweets(\"jimmieakesson\",u_or_s=\"u\",from_date=221232,to_date=2313)\n"
|
|
|
69 |
]
|
70 |
},
|
71 |
{
|
72 |
"cell_type": "code",
|
73 |
"execution_count": null,
|
74 |
+
"id": "48d50b46",
|
75 |
"metadata": {},
|
76 |
+
"outputs": [],
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
77 |
"source": [
|
78 |
+
"tweets= _data.keys()\n",
|
79 |
+
"for i in tweets:\n",
|
80 |
+
" _data[i][\"tweet\"]\n",
|
81 |
+
" print(_data[i][\"tweet\"], \"\\n\", \"__________________________________________________________\")"
|
82 |
+
]
|
83 |
+
},
|
84 |
+
{
|
85 |
+
"cell_type": "code",
|
86 |
+
"execution_count": null,
|
87 |
+
"id": "72cabcb5",
|
88 |
+
"metadata": {},
|
89 |
+
"outputs": [],
|
90 |
+
"source": [
|
91 |
+
"from_date=\"2022-6-10 10:30:22\"\n",
|
92 |
+
"to_date= \"2022-6-30\"\n",
|
93 |
+
"num_tweets = 20\n",
|
94 |
+
"_data=scrape.scraper.string_search_user_tweets(\"jimmieakesson\",\"invandring\")\n"
|
95 |
]
|
96 |
},
|
97 |
{
|
98 |
"cell_type": "code",
|
99 |
"execution_count": null,
|
100 |
+
"id": "549e4fb3",
|
101 |
+
"metadata": {},
|
102 |
+
"outputs": [],
|
103 |
+
"source": [
|
104 |
+
"tweets= _data[\"tweet\"]\n",
|
105 |
+
"for i in tweets:\n",
|
106 |
+
" print(i, \"\\n\", \"__________________________________________________________\")"
|
107 |
+
]
|
108 |
+
},
|
109 |
+
{
|
110 |
+
"cell_type": "code",
|
111 |
+
"execution_count": 3,
|
112 |
+
"id": "733dd44a",
|
113 |
"metadata": {},
|
114 |
"outputs": [
|
115 |
{
|
116 |
+
"name": "stdout",
|
117 |
+
"output_type": "stream",
|
118 |
+
"text": [
|
119 |
+
"Defaulting to user installation because normal site-packages is not writeable\n",
|
120 |
+
"Requirement already satisfied: snscrape in /home/oxygen/.local/lib/python3.10/site-packages (0.3.4)\n",
|
121 |
+
"Requirement already satisfied: beautifulsoup4 in /home/oxygen/.local/lib/python3.10/site-packages (from snscrape) (4.11.1)\n",
|
122 |
+
"Requirement already satisfied: requests[socks] in /usr/lib/python3/dist-packages (from snscrape) (2.25.1)\n",
|
123 |
+
"Requirement already satisfied: lxml in /usr/lib/python3/dist-packages (from snscrape) (4.8.0)\n",
|
124 |
+
"Requirement already satisfied: soupsieve>1.2 in /home/oxygen/.local/lib/python3.10/site-packages (from beautifulsoup4->snscrape) (2.3.2.post1)\n",
|
125 |
+
"Requirement already satisfied: PySocks!=1.5.7,>=1.5.6 in /home/oxygen/.local/lib/python3.10/site-packages (from requests[socks]->snscrape) (1.7.1)\n"
|
126 |
+
]
|
127 |
}
|
128 |
],
|
129 |
"source": [
|
130 |
+
"#%pip install -q snscrape==0.3.4\n",
|
131 |
+
"!pip3 install snscrape\n",
|
132 |
+
"#!pip3 install git+https://github.com/JustAnotherArchivist/snscrape.git"
|
133 |
]
|
134 |
},
|
135 |
{
|
136 |
"cell_type": "code",
|
137 |
+
"execution_count": 14,
|
138 |
+
"id": "0d16422c",
|
139 |
"metadata": {},
|
140 |
+
"outputs": [
|
141 |
+
{
|
142 |
+
"name": "stdout",
|
143 |
+
"output_type": "stream",
|
144 |
+
"text": [
|
145 |
+
"Note: you may need to restart the kernel to use updated packages.\n"
|
146 |
+
]
|
147 |
+
}
|
148 |
+
],
|
149 |
"source": [
|
150 |
+
"%pip install -q snscrape==0.3.4\n",
|
151 |
+
"from datetime import date\n",
|
152 |
+
"import os\n",
|
153 |
+
"import pandas as pd\n",
|
154 |
+
"\n",
|
155 |
+
"\n",
|
156 |
+
"def get_tweets(search_term, from_date, to_date=date.today(), num_tweets=100,u_or_s='s'):\n",
|
157 |
+
" if u_or_s.lower() =='u':\n",
|
158 |
+
" extracted_tweets = \"snscrape --format '{content!r}'\"+ f\" --max-results {num_tweets} --since {from_date} twitter-user '{search_term} until:{to_date}' > extracted-tweets.txt\" \n",
|
159 |
+
" else:\n",
|
160 |
+
" extracted_tweets = \"snscrape --format '{content!r}'\"+ f\" --max-results {num_tweets} --since {from_date} twitter-search '{search_term} until:{to_date}' > extracted-tweets.txt\"\n",
|
161 |
+
" \n",
|
162 |
+
" os.system(extracted_tweets)\n",
|
163 |
+
" if os.stat(\"extracted-tweets.txt\").st_size == 0:\n",
|
164 |
+
" print('No Tweets found')\n",
|
165 |
+
" else:\n",
|
166 |
+
" df = pd.read_csv('extracted-tweets.txt', names=['content'])\n",
|
167 |
+
" data_list=[]\n",
|
168 |
+
" for row in df['content'].iteritems():\n",
|
169 |
+
" temp= str(row[0])+str(row[1])\n",
|
170 |
+
" temp= temp.replace(\"\\'\",\"\")\n",
|
171 |
+
" data_list.append(temp)\n",
|
172 |
+
" return data_list\n",
|
173 |
+
"\n"
|
174 |
]
|
175 |
},
|
176 |
{
|
177 |
"cell_type": "code",
|
178 |
+
"execution_count": 12,
|
179 |
+
"id": "8e2adb35",
|
180 |
"metadata": {},
|
181 |
+
"outputs": [
|
182 |
+
{
|
183 |
+
"name": "stdout",
|
184 |
+
"output_type": "stream",
|
185 |
+
"text": [
|
186 |
+
"No Tweets found\n"
|
187 |
+
]
|
188 |
+
},
|
189 |
+
{
|
190 |
+
"name": "stderr",
|
191 |
+
"output_type": "stream",
|
192 |
+
"text": [
|
193 |
+
"Traceback (most recent call last):\n",
|
194 |
+
" File \"/home/oxygen/.local/bin/snscrape\", line 8, in <module>\n",
|
195 |
+
" sys.exit(main())\n",
|
196 |
+
" File \"/home/oxygen/.local/lib/python3.10/site-packages/snscrape/cli.py\", line 224, in main\n",
|
197 |
+
" args = parse_args()\n",
|
198 |
+
" File \"/home/oxygen/.local/lib/python3.10/site-packages/snscrape/cli.py\", line 159, in parse_args\n",
|
199 |
+
" import snscrape.modules\n",
|
200 |
+
" File \"/home/oxygen/.local/lib/python3.10/site-packages/snscrape/modules/__init__.py\", line 15, in <module>\n",
|
201 |
+
" _import_modules()\n",
|
202 |
+
" File \"/home/oxygen/.local/lib/python3.10/site-packages/snscrape/modules/__init__.py\", line 12, in _import_modules\n",
|
203 |
+
" module = importlib.import_module(moduleName)\n",
|
204 |
+
" File \"/usr/lib/python3.10/importlib/__init__.py\", line 126, in import_module\n",
|
205 |
+
" return _bootstrap._gcd_import(name[level:], package, level)\n",
|
206 |
+
" File \"/home/oxygen/.local/lib/python3.10/site-packages/snscrape/modules/instagram.py\", line 12, in <module>\n",
|
207 |
+
" class InstagramPost(typing.NamedTuple, snscrape.base.Item):\n",
|
208 |
+
" File \"/usr/lib/python3.10/typing.py\", line 2329, in _namedtuple_mro_entries\n",
|
209 |
+
" raise TypeError(\"Multiple inheritance with NamedTuple is not supported\")\n",
|
210 |
+
"TypeError: Multiple inheritance with NamedTuple is not supported\n"
|
211 |
+
]
|
212 |
+
},
|
213 |
+
{
|
214 |
+
"ename": "UnboundLocalError",
|
215 |
+
"evalue": "local variable 'df' referenced before assignment",
|
216 |
+
"output_type": "error",
|
217 |
+
"traceback": [
|
218 |
+
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
219 |
+
"\u001b[0;31mUnboundLocalError\u001b[0m Traceback (most recent call last)",
|
220 |
+
"\u001b[0;32m/tmp/ipykernel_26511/1892081786.py\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0md\u001b[0m\u001b[0;34m=\u001b[0m \u001b[0mget_tweets\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"jimmieakesson\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mfrom_date\u001b[0m\u001b[0;34m=\u001b[0m \u001b[0;34m\"2022-06-01\"\u001b[0m \u001b[0;34m,\u001b[0m\u001b[0mnum_tweets\u001b[0m \u001b[0;34m=\u001b[0m\u001b[0;36m5\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mu_or_s\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"u\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
|
221 |
+
"\u001b[0;32m/tmp/ipykernel_26511/275462205.py\u001b[0m in \u001b[0;36mget_tweets\u001b[0;34m(search_term, from_date, to_date, num_tweets, u_or_s)\u001b[0m\n\u001b[1;32m 17\u001b[0m \u001b[0mdf\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread_csv\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'extracted-tweets.txt'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnames\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'content'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 18\u001b[0m \u001b[0mdata_list\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 19\u001b[0;31m \u001b[0;32mfor\u001b[0m \u001b[0mrow\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'content'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0miteritems\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 20\u001b[0m \u001b[0mtemp\u001b[0m\u001b[0;34m=\u001b[0m \u001b[0mstr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrow\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m+\u001b[0m\u001b[0mstr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrow\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 21\u001b[0m \u001b[0mtemp\u001b[0m\u001b[0;34m=\u001b[0m \u001b[0mtemp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreplace\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"\\'\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\"\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
|
222 |
+
"\u001b[0;31mUnboundLocalError\u001b[0m: local variable 'df' referenced before assignment"
|
223 |
+
]
|
224 |
+
}
|
225 |
+
],
|
226 |
"source": [
|
227 |
+
"d= get_tweets(\"jimmieakesson\",from_date= \"2022-06-01\" ,num_tweets =5, u_or_s=\"u\")"
|
|
|
|
|
|
|
228 |
]
|
229 |
},
|
230 |
{
|
231 |
"cell_type": "code",
|
232 |
"execution_count": null,
|
233 |
+
"id": "a2c837f4",
|
234 |
"metadata": {},
|
235 |
"outputs": [],
|
236 |
+
"source": []
|
|
|
|
|
|
|
|
|
237 |
}
|
238 |
],
|
239 |
"metadata": {
|