pokemon / make_dataset.py
Yoon-gu Hwang
upload files
6cd90ae
raw
history blame
No virus
2.54 kB
import json
import os
import urllib.parse
import urllib.request
from urllib.parse import urlsplit, quote
from urllib.request import Request, urlopen

import pandas as pd
from bs4 import BeautifulSoup
from tqdm import tqdm
# Alternative start pages, left commented out (presumably kept for trying
# the scraper against individual entries -- confirm before removing).
# url = 'https://pokemon.fandom.com/ko/wiki/흥나숭_(포켓몬)'
# url = 'https://pokemon.fandom.com/ko/wiki/나몰빼미_(포켓몬)'
# url = 'https://pokemon.fandom.com/ko/wiki/도치마론_(포켓몬)'
# url = 'https://pokemon.fandom.com/ko/wiki/비크티니_(포켓몬)'
# url = 'https://pokemon.fandom.com/ko/wiki/모부기_(포켓몬)'
# url = 'https://pokemon.fandom.com/ko/wiki/나무지기_(포켓몬)'
# url = 'https://pokemon.fandom.com/ko/wiki/치코리타_(포켓몬)'
# url = 'https://pokemon.fandom.com/ko/wiki/토게틱_(포켓몬)'
# url = 'https://pokemon.fandom.com/ko/wiki/포푸니_(포켓몬)'
# url = 'https://pokemon.fandom.com/ko/wiki/레트라_(포켓몬)'
# url = 'https://pokemon.fandom.com/ko/wiki/신비록_(포켓몬)'

# Page the crawl starts from.
url = 'https://pokemon.fandom.com/ko/wiki/이상해씨_(포켓몬)'

# Only the path holds non-ASCII characters, so it alone is percent-encoded;
# scheme and host are copied through unchanged.
url_info = urlsplit(url)
encoded_url = url_info.scheme + '://' + url_info.netloc + quote(url_info.path)

# Crawl limits and accumulators read by the scraping loop.
target_number = 1017  # compared against the page index as "No.XXXX"
cnt = 0               # number of pages processed so far
info = []             # one dict per scraped page
erros = []
# urlretrieve() opens its destination path directly and does not create
# missing directories; without this every download fails and each record
# ends up with image_path=None.
os.makedirs("images", exist_ok=True)

# Scrape one page per iteration, then move on to the page linked from the
# last anchor of the first table (presumably the "next entry" navigation
# link -- verify against the live page layout). `cnt` doubles as the
# processed-page counter, so the loop visits at most `target_number` pages
# and the progress bar total matches the real upper bound.
for cnt in tqdm(range(1, target_number + 1)):
    req = Request(encoded_url, headers={'User-Agent': 'Mozilla/5.0'})
    # Close the HTTP response as soon as the body has been read.
    with urlopen(req) as res:
        html = res.read()
    soup = BeautifulSoup(html, 'html.parser')

    name = soup.find("div", {"class": "name-ko"}).text.strip()
    number = soup.find("div", {"class": "index"}).text.strip()

    # The image is best-effort: a missing tag or failed download must not
    # abort the crawl. `except Exception` (instead of a bare `except`) keeps
    # Ctrl-C / SystemExit working, and the failure is recorded, not hidden.
    try:
        img_url = soup.find("div", {"class": "image rounded"}).find("img")['data-src']
        filepath = f"images/{number.replace('.', '_')}_{name}.png"
        urllib.request.urlretrieve(img_url, filepath)
    except Exception as exc:
        filepath = None
        erros.append(dict(url=encoded_url, number=number, error=repr(exc)))
        tqdm.write(f"image download failed for {number} {name}: {exc!r}")

    # All paragraph texts, one per line, with embedded newlines removed.
    doc_text = '\n'.join(p.text.replace('\n', '').strip() for p in soup.find_all("p"))
    # Type names: first word of each link title inside the first matching div.
    type_links = soup.select('tbody > tr > td > div')[0].select('span > a')
    types = [poke_type['title'].split(' ')[0].strip() for poke_type in type_links]

    info.append(dict(
        name=name,
        number=number,
        types=types,
        doc_text=doc_text,
        image_path=filepath,
        url=encoded_url,
    ))

    # Stop once the target entry has been recorded. Checking before the
    # next-link lookup means a final page without a usable link cannot raise
    # and lose the collected data.
    if number == f"No.{target_number:04d}":
        break

    next_monster = soup.find("table").find_all("a")[-1]['href']
    encoded_url = "https://pokemon.fandom.com" + next_monster
# Persist the scraped records twice: as a flat CSV table and as JSON.
pd.DataFrame(info).to_csv('pokemon.csv', index=False)
# ensure_ascii=False emits the Korean text as raw characters, so the file
# must be opened as UTF-8 explicitly; the platform default encoding may not
# be able to represent them (or would produce non-UTF-8 JSON).
with open('pokemon.json', 'w', encoding='utf-8') as f:
    json.dump(info, f, ensure_ascii=False, indent=4)