arjunpatel committed on
Commit
055ef85
1 Parent(s): 1b1e129

First commit

Browse files

A script that can grab a movelist from Serebii.net. Only 3 webpages are needed, to minimize requests. Woohoo!

Files changed (1) hide show
  1. move_scraper.ipynb +415 -0
move_scraper.ipynb ADDED
@@ -0,0 +1,415 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "# Pokemon Attack Scraping Script"
8
+ ]
9
+ },
10
+ {
11
+ "cell_type": "code",
12
+ "execution_count": 3,
13
+ "metadata": {},
14
+ "outputs": [],
15
+ "source": [
16
+ "import pandas as pd\n",
17
+ "import requests\n",
18
+ "from bs4 import BeautifulSoup\n",
19
+ "\n",
20
+ "\n",
21
+ "\n",
22
+ "\n",
23
+ "physical_moves = \"https://www.serebii.net/attackdex-swsh/physical.shtml\" \n",
24
+ "special_moves = \"https://www.serebii.net/attackdex-swsh/special.shtml\"\n",
25
+ "status_moves = \"https://www.serebii.net/attackdex-swsh/other.shtml\""
26
+ ]
27
+ },
28
+ {
29
+ "cell_type": "code",
30
+ "execution_count": 4,
31
+ "metadata": {},
32
+ "outputs": [],
33
+ "source": [
34
+ "\n",
35
+ "\n",
36
+ "data = requests.get(physical_moves)"
37
+ ]
38
+ },
39
+ {
40
+ "cell_type": "code",
41
+ "execution_count": 29,
42
+ "metadata": {},
43
+ "outputs": [],
44
+ "source": [
45
+ "my_data = []\n",
46
+ " #https://www.kite.com/python/examples/4420/beautifulsoup-parse-an-html-table-and-write-to-a-csv\n",
47
+ "html = BeautifulSoup(data.text, 'html.parser')\n"
48
+ ]
49
+ },
50
+ {
51
+ "cell_type": "code",
52
+ "execution_count": 83,
53
+ "metadata": {},
54
+ "outputs": [],
55
+ "source": [
56
+ "\n",
57
+ "soup = html\n",
58
+ "table = soup.find_all(\"table\")[1]\n",
59
+ "\n",
60
+ "output_rows = []\n",
61
+ "for table_row in table.findAll('tr'):\n",
62
+ " columns = table_row.findAll('td')\n",
63
+ " output_row = []\n",
64
+ " for column in columns:\n",
65
+ " output_row.append(column.text.strip())\n",
66
+ " output_rows.append(output_row)\n",
67
+ "\n"
68
+ ]
69
+ },
70
+ {
71
+ "cell_type": "code",
72
+ "execution_count": 84,
73
+ "metadata": {},
74
+ "outputs": [],
75
+ "source": [
76
+ "moves = pd.DataFrame(output_rows)"
77
+ ]
78
+ },
79
+ {
80
+ "cell_type": "code",
81
+ "execution_count": null,
82
+ "metadata": {},
83
+ "outputs": [],
84
+ "source": []
85
+ },
86
+ {
87
+ "cell_type": "code",
88
+ "execution_count": 85,
89
+ "metadata": {},
90
+ "outputs": [],
91
+ "source": [
92
+ "moves.columns = moves.iloc[0]"
93
+ ]
94
+ },
95
+ {
96
+ "cell_type": "code",
97
+ "execution_count": 86,
98
+ "metadata": {},
99
+ "outputs": [],
100
+ "source": [
101
+ "moves = moves[1:]"
102
+ ]
103
+ },
104
+ {
105
+ "cell_type": "code",
106
+ "execution_count": 87,
107
+ "metadata": {},
108
+ "outputs": [],
109
+ "source": [
110
+ "moves = moves[moves.Effect.apply(lambda x: \"This move can't be used\" not in x)]"
111
+ ]
112
+ },
113
+ {
114
+ "cell_type": "code",
115
+ "execution_count": 88,
116
+ "metadata": {},
117
+ "outputs": [
118
+ {
119
+ "data": {
120
+ "text/html": [
121
+ "<div>\n",
122
+ "<style scoped>\n",
123
+ " .dataframe tbody tr th:only-of-type {\n",
124
+ " vertical-align: middle;\n",
125
+ " }\n",
126
+ "\n",
127
+ " .dataframe tbody tr th {\n",
128
+ " vertical-align: top;\n",
129
+ " }\n",
130
+ "\n",
131
+ " .dataframe thead th {\n",
132
+ " text-align: right;\n",
133
+ " }\n",
134
+ "</style>\n",
135
+ "<table border=\"1\" class=\"dataframe\">\n",
136
+ " <thead>\n",
137
+ " <tr style=\"text-align: right;\">\n",
138
+ " <th></th>\n",
139
+ " <th>Name</th>\n",
140
+ " <th>Type</th>\n",
141
+ " <th>Cat.</th>\n",
142
+ " <th>PP</th>\n",
143
+ " <th>Att.</th>\n",
144
+ " <th>Acc.</th>\n",
145
+ " <th>Effect</th>\n",
146
+ " </tr>\n",
147
+ " </thead>\n",
148
+ " <tbody>\n",
149
+ " <tr>\n",
150
+ " <th>1</th>\n",
151
+ " <td>Accelerock</td>\n",
152
+ " <td></td>\n",
153
+ " <td></td>\n",
154
+ " <td>20</td>\n",
155
+ " <td>40</td>\n",
156
+ " <td>100</td>\n",
157
+ " <td>The user smashes into the target at high speed...</td>\n",
158
+ " </tr>\n",
159
+ " <tr>\n",
160
+ " <th>3</th>\n",
161
+ " <td>Acrobatics</td>\n",
162
+ " <td></td>\n",
163
+ " <td></td>\n",
164
+ " <td>15</td>\n",
165
+ " <td>55</td>\n",
166
+ " <td>100</td>\n",
167
+ " <td>The user nimbly strikes the target. If the use...</td>\n",
168
+ " </tr>\n",
169
+ " <tr>\n",
170
+ " <th>4</th>\n",
171
+ " <td>Aerial Ace</td>\n",
172
+ " <td></td>\n",
173
+ " <td></td>\n",
174
+ " <td>20</td>\n",
175
+ " <td>60</td>\n",
176
+ " <td>101</td>\n",
177
+ " <td>The user confounds the target with speed, then...</td>\n",
178
+ " </tr>\n",
179
+ " <tr>\n",
180
+ " <th>6</th>\n",
181
+ " <td>Anchor Shot</td>\n",
182
+ " <td></td>\n",
183
+ " <td></td>\n",
184
+ " <td>20</td>\n",
185
+ " <td>80</td>\n",
186
+ " <td>100</td>\n",
187
+ " <td>The user entangles the target with its anchor ...</td>\n",
188
+ " </tr>\n",
189
+ " <tr>\n",
190
+ " <th>7</th>\n",
191
+ " <td>Aqua Jet</td>\n",
192
+ " <td></td>\n",
193
+ " <td></td>\n",
194
+ " <td>20</td>\n",
195
+ " <td>40</td>\n",
196
+ " <td>100</td>\n",
197
+ " <td>The user lunges at the target at a speed that ...</td>\n",
198
+ " </tr>\n",
199
+ " <tr>\n",
200
+ " <th>...</th>\n",
201
+ " <td>...</td>\n",
202
+ " <td>...</td>\n",
203
+ " <td>...</td>\n",
204
+ " <td>...</td>\n",
205
+ " <td>...</td>\n",
206
+ " <td>...</td>\n",
207
+ " <td>...</td>\n",
208
+ " </tr>\n",
209
+ " <tr>\n",
210
+ " <th>381</th>\n",
211
+ " <td>Wood Hammer</td>\n",
212
+ " <td></td>\n",
213
+ " <td></td>\n",
214
+ " <td>15</td>\n",
215
+ " <td>120</td>\n",
216
+ " <td>100</td>\n",
217
+ " <td>The user slams its rugged body into the target...</td>\n",
218
+ " </tr>\n",
219
+ " <tr>\n",
220
+ " <th>382</th>\n",
221
+ " <td>Wrap</td>\n",
222
+ " <td></td>\n",
223
+ " <td></td>\n",
224
+ " <td>20</td>\n",
225
+ " <td>15</td>\n",
226
+ " <td>90</td>\n",
227
+ " <td>A long body, vines, or the like are used to wr...</td>\n",
228
+ " </tr>\n",
229
+ " <tr>\n",
230
+ " <th>383</th>\n",
231
+ " <td>X-Scissor</td>\n",
232
+ " <td></td>\n",
233
+ " <td></td>\n",
234
+ " <td>15</td>\n",
235
+ " <td>80</td>\n",
236
+ " <td>100</td>\n",
237
+ " <td>The user slashes at the target by crossing its...</td>\n",
238
+ " </tr>\n",
239
+ " <tr>\n",
240
+ " <th>384</th>\n",
241
+ " <td>Zen Headbutt</td>\n",
242
+ " <td></td>\n",
243
+ " <td></td>\n",
244
+ " <td>15</td>\n",
245
+ " <td>80</td>\n",
246
+ " <td>90</td>\n",
247
+ " <td>The user focuses its willpower to its head and...</td>\n",
248
+ " </tr>\n",
249
+ " <tr>\n",
250
+ " <th>385</th>\n",
251
+ " <td>Zing Zap</td>\n",
252
+ " <td></td>\n",
253
+ " <td></td>\n",
254
+ " <td>10</td>\n",
255
+ " <td>80</td>\n",
256
+ " <td>100</td>\n",
257
+ " <td>A strong electric blast crashes down on the ta...</td>\n",
258
+ " </tr>\n",
259
+ " </tbody>\n",
260
+ "</table>\n",
261
+ "<p>322 rows × 7 columns</p>\n",
262
+ "</div>"
263
+ ],
264
+ "text/plain": [
265
+ "0 Name Type Cat. PP Att. Acc. \\\n",
266
+ "1 Accelerock 20 40 100 \n",
267
+ "3 Acrobatics 15 55 100 \n",
268
+ "4 Aerial Ace 20 60 101 \n",
269
+ "6 Anchor Shot 20 80 100 \n",
270
+ "7 Aqua Jet 20 40 100 \n",
271
+ ".. ... ... ... .. ... ... \n",
272
+ "381 Wood Hammer 15 120 100 \n",
273
+ "382 Wrap 20 15 90 \n",
274
+ "383 X-Scissor 15 80 100 \n",
275
+ "384 Zen Headbutt 15 80 90 \n",
276
+ "385 Zing Zap 10 80 100 \n",
277
+ "\n",
278
+ "0 Effect \n",
279
+ "1 The user smashes into the target at high speed... \n",
280
+ "3 The user nimbly strikes the target. If the use... \n",
281
+ "4 The user confounds the target with speed, then... \n",
282
+ "6 The user entangles the target with its anchor ... \n",
283
+ "7 The user lunges at the target at a speed that ... \n",
284
+ ".. ... \n",
285
+ "381 The user slams its rugged body into the target... \n",
286
+ "382 A long body, vines, or the like are used to wr... \n",
287
+ "383 The user slashes at the target by crossing its... \n",
288
+ "384 The user focuses its willpower to its head and... \n",
289
+ "385 A strong electric blast crashes down on the ta... \n",
290
+ "\n",
291
+ "[322 rows x 7 columns]"
292
+ ]
293
+ },
294
+ "execution_count": 88,
295
+ "metadata": {},
296
+ "output_type": "execute_result"
297
+ }
298
+ ],
299
+ "source": [
300
+ "moves"
301
+ ]
302
+ },
303
+ {
304
+ "cell_type": "code",
305
+ "execution_count": 90,
306
+ "metadata": {},
307
+ "outputs": [
308
+ {
309
+ "name": "stdout",
310
+ "output_type": "stream",
311
+ "text": [
312
+ "Removing some old moves... Found 386\n",
313
+ "Removing some old moves... Found 237\n",
314
+ "Removing some old moves... Found 260\n"
315
+ ]
316
+ }
317
+ ],
318
+ "source": [
319
+ "def create_moves_df(url):\n",
320
+ " # given a url, scrapes the moves table and turns it into a pandas df\n",
321
+ " # works on Serebii's moves attackdex\n",
322
+ " # removes moves that are no longer usable in that generation\n",
323
+ " data = requests.get(url)\n",
324
+ " soup = BeautifulSoup(data.text, 'html.parser')\n",
325
+ " \n",
326
+ " # first table is the table of contents for the moves, we don't want that\n",
327
+ " \n",
328
+ " # following code grabbed from #https://www.kite.com/python/examples/4420/beautifulsoup-parse-an-html-table-and-write-to-a-csv\n",
329
+ " table = soup.find_all(\"table\")[1]\n",
330
+ "\n",
331
+ " output_rows = []\n",
332
+ " for table_row in table.findAll('tr'):\n",
333
+ " columns = table_row.findAll('td')\n",
334
+ " output_row = []\n",
335
+ " for column in columns:\n",
336
+ " output_row.append(column.text.strip())\n",
337
+ " output_rows.append(output_row)\n",
338
+ " \n",
339
+ " moves = pd.DataFrame(output_rows)\n",
340
+ " \n",
341
+ " # promote the first row to column headers, then drop that row\n",
342
+ " moves.columns = moves.iloc[0]\n",
343
+ " moves = moves[1:].reset_index(drop = True)\n",
344
+ " \n",
345
+ " # drop moves that don't exist in this gen, but also count them\n",
346
+ " unusable_moves = moves.Effect.apply(lambda x: \"This move can't be used\" in x).sum()\n",
347
+ " print(\"Removing some old moves... Found \", unusable_moves)\n",
348
+ " moves = moves[moves.Effect.apply(lambda x: \"This move can't be used\" not in x)]\n",
349
+ " return moves\n",
350
+ "\n",
351
+ "\n",
352
+ "\n",
353
+ "\n",
354
+ "physical_moves = \"https://www.serebii.net/attackdex-swsh/physical.shtml\" \n",
355
+ "special_moves = \"https://www.serebii.net/attackdex-swsh/special.shtml\"\n",
356
+ "status_moves = \"https://www.serebii.net/attackdex-swsh/other.shtml\"\n",
357
+ "\n",
358
+ "\n",
359
+ "physical_df = create_moves_df(physical_moves)\n",
360
+ "special_df = create_moves_df(special_moves)\n",
361
+ "status_df = create_moves_df(status_moves)\n",
362
+ "\n",
363
+ "moves = pd.concat([physical_df, special_df, status_df])\n"
364
+ ]
365
+ },
366
+ {
367
+ "cell_type": "code",
368
+ "execution_count": 93,
369
+ "metadata": {},
370
+ "outputs": [
371
+ {
372
+ "data": {
373
+ "text/plain": [
374
+ "743"
375
+ ]
376
+ },
377
+ "execution_count": 93,
378
+ "metadata": {},
379
+ "output_type": "execute_result"
380
+ }
381
+ ],
382
+ "source": [
383
+ "len(moves)"
384
+ ]
385
+ },
386
+ {
387
+ "cell_type": "code",
388
+ "execution_count": null,
389
+ "metadata": {},
390
+ "outputs": [],
391
+ "source": []
392
+ }
393
+ ],
394
+ "metadata": {
395
+ "kernelspec": {
396
+ "display_name": "Python 3",
397
+ "language": "python",
398
+ "name": "python3"
399
+ },
400
+ "language_info": {
401
+ "codemirror_mode": {
402
+ "name": "ipython",
403
+ "version": 3
404
+ },
405
+ "file_extension": ".py",
406
+ "mimetype": "text/x-python",
407
+ "name": "python",
408
+ "nbconvert_exporter": "python",
409
+ "pygments_lexer": "ipython3",
410
+ "version": "3.8.3"
411
+ }
412
+ },
413
+ "nbformat": 4,
414
+ "nbformat_minor": 4
415
+ }