Spaces:
Sleeping
Sleeping
File size: 1,514 Bytes
59b8c1b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 |
import urllib.request
import re
import json
import urllib.parse
from urllib.parse import urlsplit, quote
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup
from tqdm import tqdm
import pandas as pd
# Crawl a chain of pokemon.fandom.com (ko) wiki pages, beginning at `url`.
# For each page, record the text of the "name-ko" div together with the span
# texts of the tables nested in the table matched by `evol_table_style`
# (presumably the evolution chain -- confirm against the live page layout).
# The crawl follows the last link of each page's first table, and stops when
# the page's "index" div reads "No.<target_number>" or after `target_number`
# pages, whichever comes first. Results are written to pokemon_evolve.json.
url = 'https://pokemon.fandom.com/ko/wiki/이상해씨_(포켓몬)'
url_info = urlsplit(url)
# The path holds non-ASCII characters; percent-encode it so urllib can send it.
encoded_url = f'{url_info.scheme}://{url_info.netloc}{quote(url_info.path)}'
info = []
target_number = 1017

# Loop invariants, built once instead of on every request.
evol_table_style = re.compile("^margin:auto; text-align:center;")
request_headers = {'User-Agent': 'Mozilla/5.0'}

# At most `target_number` pages are fetched, so the progress-bar total is exact.
for _ in tqdm(range(target_number)):
    req = Request(encoded_url, headers=request_headers)
    # Close the HTTP response as soon as the body has been read.
    with urlopen(req) as res:
        html = res.read()
    soup = BeautifulSoup(html, 'html.parser')

    name = soup.find("div", {"class": "name-ko"}).text.strip()
    number = soup.find("div", {"class": "index"}).text.strip()

    # A page without the styled table (or with a nested table lacking a
    # <span>) yields fewer/no entries instead of aborting the whole crawl.
    evol_tables = soup.find("table", style=evol_table_style)
    if evol_tables is None:
        evolve = []
    else:
        evolve = [
            e.span.text
            for e in evol_tables.find_all("table")
            if e.span is not None
        ]
    info.append(dict(name=name, evolve=evolve))

    # Stop before resolving the next link: the final page's link is not needed.
    if number == f"No.{target_number:04d}":
        break

    # The last anchor of the first table on the page is used as the next page.
    next_monster = soup.find("table").find_all("a")[-1]['href']
    encoded_url = "https://pokemon.fandom.com" + next_monster

# ensure_ascii=False emits raw non-ASCII text, so the file encoding must be
# pinned to UTF-8 rather than left to the platform default.
with open('pokemon_evolve.json', 'w', encoding='utf-8') as f:
    json.dump(info, f, ensure_ascii=False, indent=4)