import json
import os
import urllib.request
from urllib.parse import urlsplit, quote
from urllib.request import Request, urlopen

from bs4 import BeautifulSoup
from tqdm import tqdm
import pandas as pd

# Starting page: 이상해씨 (Bulbasaur) on the Korean Pokémon fandom wiki.
url = 'https://pokemon.fandom.com/ko/wiki/이상해씨_(포켓몬)'

# Percent-encode the Korean path segment so urllib can request the URL.
url_info = urlsplit(url)
encoded_url = f'{url_info.scheme}://{url_info.netloc}{quote(url_info.path)}'
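
# The downloads below save into images/; create the directory up front, otherwise
# urlretrieve raises FileNotFoundError and the except block silently skips every image.
os.makedirs('images', exist_ok=True)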

info = []             # one dict per scraped Pokémon
errors = []           # pages whose image download failed
target_number = 1017  # National Pokédex number of the last Pokémon to scrape
cnt = 0

for _ in tqdm(range(target_number + 2)):
    cnt += 1
    # Fetch the current page with a browser-like User-Agent header
    # (some sites reject the default urllib agent).
    req = Request(encoded_url, headers={'User-Agent': 'Mozilla/5.0'})
    res = urlopen(req)
    html = res.read()
    soup = BeautifulSoup(html, 'html.parser')

    # Korean name and National Pokédex index (e.g. "No.0001") from the infobox.
    name = soup.find("div", {"class": "name-ko"}).text.strip()
    number = soup.find("div", {"class": "index"}).text.strip()

    # Download the Pokémon image if one is present; on any failure, record the page and move on.
    try:
        img_url = soup.find("div", {"class": "image rounded"}).find("img")['data-src']
        filepath = f"images/{number.replace('.', '_')}_{name}.png"
        urllib.request.urlretrieve(img_url, filepath)
    except Exception:
        filepath = None
        errors.append(encoded_url)

    # Join every paragraph of the article into a single description string.
    doc_text = '\n'.join([p.text.replace('\n', '').strip() for p in soup.find_all("p")])
    # Type names come from the links in the first type cell of the infobox table.
    types = [poke_type['title'].split(' ')[0].strip()
             for poke_type in soup.select('tbody > tr > td > div')[0].select('span > a')]

    info.append(dict(
        name=name,
        number=number,
        types=types,
        doc_text=doc_text,
        image_path=filepath,
        url=encoded_url,
    ))

    # The last link in the first table on the page points to the next Pokémon's
    # article; the href is site-relative, so prepend the host.
    next_monster = soup.find("table").find_all("a")[-1]['href']
    encoded_url = "https://pokemon.fandom.com" + next_monster

    # Stop once the target Pokédex number has been scraped (cnt is a safety cap on iterations).
    if number == f"No.{target_number:04d}":
        break
    if cnt >= target_number:
        break

# Persist everything as CSV and JSON (UTF-8 so the Korean text survives).
pd.DataFrame(info).to_csv('pokemon.csv', index=False)
with open('pokemon.json', 'w', encoding='utf-8') as f:
    json.dump(info, f, ensure_ascii=False, indent=4)
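
# Optional sanity check (a minimal sketch, not part of the scrape itself): reload
# the saved JSON and confirm the record count and one sample entry.
with open('pokemon.json', encoding='utf-8') as f:
    records = json.load(f)
print(f"records: {len(records)}, image failures: {len(errors)}")
print(records[0]['name'], records[0]['number'], records[0]['types'])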